Loading our dataset into a dataframe.

# read.csv() already returns a data.frame, so the data.frame() wrapper is redundant.
Telecom_Data <- read.csv("Telecom Data.csv")

# Dataset dimensions: columns and rows.
ncol(Telecom_Data)
## [1] 58
nrow(Telecom_Data)
## [1] 51047

There are a total of 58 columns and 51,047 rows.

Let us print the structure of our data.

# Inspect the structure: each column's name, type, and a preview of its values.
str(Telecom_Data)
## 'data.frame':    51047 obs. of  58 variables:
##  $ CustomerID               : int  3000002 3000010 3000014 3000022 3000026 3000030 3000038 3000042 3000046 3000050 ...
##  $ Churn                    : chr  "Yes" "Yes" "No" "No" ...
##  $ MonthlyRevenue           : num  24 17 38 82.3 17.1 ...
##  $ MonthlyMinutes           : int  219 10 8 1312 0 682 26 98 24 1056 ...
##  $ TotalRecurringCharge     : int  22 17 38 75 17 52 30 66 35 75 ...
##  $ DirectorAssistedCalls    : num  0.25 0 0 1.24 0 0.25 0.25 2.48 0 0 ...
##  $ OverageMinutes           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RoamingCalls             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PercChangeMinutes        : int  -157 -4 -2 157 0 148 60 24 20 43 ...
##  $ PercChangeRevenues       : num  -19 0 0 8.1 -0.2 -3.1 4 6.8 -0.3 2.4 ...
##  $ DroppedCalls             : num  0.7 0.3 0 52 0 9 0 0 0 0 ...
##  $ BlockedCalls             : num  0.7 0 0 7.7 0 1.7 1 0.3 0 0 ...
##  $ UnansweredCalls          : num  6.3 2.7 0 76 0 13 2.3 4 1 0 ...
##  $ CustomerCareCalls        : num  0 0 0 4.3 0 0.7 0 4 0 0 ...
##  $ ThreewayCalls            : num  0 0 0 1.3 0 0 0 0 0 0 ...
##  $ ReceivedCalls            : num  97.2 0 0.4 200.3 0 ...
##  $ OutboundCalls            : num  0 0 0.3 370.3 0 ...
##  $ InboundCalls             : num  0 0 0 147 0 0 0 0 1.7 0 ...
##  $ PeakCallsInOut           : num  58 5 1.3 555.7 0 ...
##  $ OffPeakCallsInOut        : num  24 1 3.7 303.7 0 ...
##  $ DroppedBlockedCalls      : num  1.3 0.3 0 59.7 0 10.7 1 0.3 0 0 ...
##  $ CallForwardingCalls      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CallWaitingCalls         : num  0.3 0 0 22.7 0 0.7 0 0 0 0 ...
##  $ MonthsInService          : int  61 58 60 59 53 53 57 59 53 55 ...
##  $ UniqueSubs               : int  2 1 1 2 2 1 2 2 3 1 ...
##  $ ActiveSubs               : int  1 1 1 2 2 1 2 2 3 1 ...
##  $ ServiceArea              : chr  "SEAPOR503" "PITHOM412" "MILMIL414" "PITHOM412" ...
##  $ Handsets                 : int  2 2 1 9 4 3 2 3 4 9 ...
##  $ HandsetModels            : int  2 1 1 4 3 2 2 3 3 5 ...
##  $ CurrentEquipmentDays     : int  361 1504 1812 458 852 231 601 464 544 388 ...
##  $ AgeHH1                   : int  62 40 26 30 46 28 52 46 36 46 ...
##  $ AgeHH2                   : int  0 42 26 0 54 0 58 46 34 68 ...
##  $ ChildrenInHH             : chr  "No" "Yes" "Yes" "No" ...
##  $ HandsetRefurbished       : chr  "No" "No" "No" "No" ...
##  $ HandsetWebCapable        : chr  "Yes" "No" "No" "Yes" ...
##  $ TruckOwner               : chr  "No" "No" "No" "No" ...
##  $ RVOwner                  : chr  "No" "No" "No" "No" ...
##  $ Homeownership            : chr  "Known" "Known" "Unknown" "Known" ...
##  $ BuysViaMailOrder         : chr  "Yes" "Yes" "No" "Yes" ...
##  $ RespondsToMailOffers     : chr  "Yes" "Yes" "No" "Yes" ...
##  $ OptOutMailings           : chr  "No" "No" "No" "No" ...
##  $ NonUSTravel              : chr  "No" "No" "No" "No" ...
##  $ OwnsComputer             : chr  "Yes" "Yes" "No" "No" ...
##  $ HasCreditCard            : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ RetentionCalls           : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ RetentionOffersAccepted  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NewCellphoneUser         : chr  "No" "Yes" "Yes" "Yes" ...
##  $ NotNewCellphoneUser      : chr  "No" "No" "No" "No" ...
##  $ ReferralsMadeBySubscriber: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ IncomeGroup              : int  4 5 6 6 9 1 9 6 9 5 ...
##  $ OwnsMotorcycle           : chr  "No" "No" "No" "No" ...
##  $ AdjustmentsToCreditRating: int  0 0 0 0 1 1 1 0 0 1 ...
##  $ HandsetPrice             : chr  "30" "30" "Unknown" "10" ...
##  $ MadeCallToRetentionTeam  : chr  "Yes" "No" "No" "No" ...
##  $ CreditRating             : chr  "1-Highest" "4-Medium" "3-Good" "4-Medium" ...
##  $ PrizmCode                : chr  "Suburban" "Suburban" "Town" "Other" ...
##  $ Occupation               : chr  "Professional" "Professional" "Crafts" "Other" ...
##  $ MaritalStatus            : chr  "No" "Yes" "Yes" "No" ...

Here we are converting a few columns to factor data type.

#Telecom_Data$Churn <- factor(Telecom_Data$Churn)
# Convert the categorical text columns to factors for modeling and summaries.
for (col_name in c("CreditRating", "Occupation")) {
  Telecom_Data[[col_name]] <- factor(Telecom_Data[[col_name]])
}

Getting the summary of our data.

#xkablesummary(Telecom_Data)

Let’s check for null (missing) values.

library(dplyr)
library(tidyr)
## Count missing (NA) values per column.
# colSums(is.na(...)) is vectorized and always returns a named numeric vector;
# sapply()'s return type depends on its input, so it is avoided here.
null_values <- colSums(is.na(Telecom_Data))
null_values
##                CustomerID                     Churn            MonthlyRevenue 
##                         0                         0                       156 
##            MonthlyMinutes      TotalRecurringCharge     DirectorAssistedCalls 
##                       156                       156                       156 
##            OverageMinutes              RoamingCalls         PercChangeMinutes 
##                       156                       156                       367 
##        PercChangeRevenues              DroppedCalls              BlockedCalls 
##                       367                         0                         0 
##           UnansweredCalls         CustomerCareCalls             ThreewayCalls 
##                         0                         0                         0 
##             ReceivedCalls             OutboundCalls              InboundCalls 
##                         0                         0                         0 
##            PeakCallsInOut         OffPeakCallsInOut       DroppedBlockedCalls 
##                         0                         0                         0 
##       CallForwardingCalls          CallWaitingCalls           MonthsInService 
##                         0                         0                         0 
##                UniqueSubs                ActiveSubs               ServiceArea 
##                         0                         0                         0 
##                  Handsets             HandsetModels      CurrentEquipmentDays 
##                         1                         1                         1 
##                    AgeHH1                    AgeHH2              ChildrenInHH 
##                       909                       909                         0 
##        HandsetRefurbished         HandsetWebCapable                TruckOwner 
##                         0                         0                         0 
##                   RVOwner             Homeownership          BuysViaMailOrder 
##                         0                         0                         0 
##      RespondsToMailOffers            OptOutMailings               NonUSTravel 
##                         0                         0                         0 
##              OwnsComputer             HasCreditCard            RetentionCalls 
##                         0                         0                         0 
##   RetentionOffersAccepted          NewCellphoneUser       NotNewCellphoneUser 
##                         0                         0                         0 
## ReferralsMadeBySubscriber               IncomeGroup            OwnsMotorcycle 
##                         0                         0                         0 
## AdjustmentsToCreditRating              HandsetPrice   MadeCallToRetentionTeam 
##                         0                         0                         0 
##              CreditRating                 PrizmCode                Occupation 
##                         0                         0                         0 
##             MaritalStatus 
##                         0

A few columns have null values, but the counts are small relative to the dataset size.

Let’s create new variables that will help with our analysis.

## Creation of new variables for our analysis.
# Guard against zero denominators: raw division yields Inf (x/0) or NaN (0/0),
# which pollutes summary statistics (the original output shows Mean: Inf and
# NaN entries); record NA instead so na.rm-style handling works downstream.
Telecom_Data$perc_recurrent_charge <- ifelse(
  !is.na(Telecom_Data$MonthlyRevenue) & Telecom_Data$MonthlyRevenue != 0,
  (Telecom_Data$TotalRecurringCharge / Telecom_Data$MonthlyRevenue) * 100,
  NA_real_
)

Telecom_Data$perc_overage_minute <- ifelse(
  !is.na(Telecom_Data$MonthlyMinutes) & Telecom_Data$MonthlyMinutes != 0,
  (Telecom_Data$OverageMinutes / Telecom_Data$MonthlyMinutes) * 100,
  NA_real_
)

str(Telecom_Data)
## 'data.frame':    51047 obs. of  60 variables:
##  $ CustomerID               : int  3000002 3000010 3000014 3000022 3000026 3000030 3000038 3000042 3000046 3000050 ...
##  $ Churn                    : chr  "Yes" "Yes" "No" "No" ...
##  $ MonthlyRevenue           : num  24 17 38 82.3 17.1 ...
##  $ MonthlyMinutes           : int  219 10 8 1312 0 682 26 98 24 1056 ...
##  $ TotalRecurringCharge     : int  22 17 38 75 17 52 30 66 35 75 ...
##  $ DirectorAssistedCalls    : num  0.25 0 0 1.24 0 0.25 0.25 2.48 0 0 ...
##  $ OverageMinutes           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RoamingCalls             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PercChangeMinutes        : int  -157 -4 -2 157 0 148 60 24 20 43 ...
##  $ PercChangeRevenues       : num  -19 0 0 8.1 -0.2 -3.1 4 6.8 -0.3 2.4 ...
##  $ DroppedCalls             : num  0.7 0.3 0 52 0 9 0 0 0 0 ...
##  $ BlockedCalls             : num  0.7 0 0 7.7 0 1.7 1 0.3 0 0 ...
##  $ UnansweredCalls          : num  6.3 2.7 0 76 0 13 2.3 4 1 0 ...
##  $ CustomerCareCalls        : num  0 0 0 4.3 0 0.7 0 4 0 0 ...
##  $ ThreewayCalls            : num  0 0 0 1.3 0 0 0 0 0 0 ...
##  $ ReceivedCalls            : num  97.2 0 0.4 200.3 0 ...
##  $ OutboundCalls            : num  0 0 0.3 370.3 0 ...
##  $ InboundCalls             : num  0 0 0 147 0 0 0 0 1.7 0 ...
##  $ PeakCallsInOut           : num  58 5 1.3 555.7 0 ...
##  $ OffPeakCallsInOut        : num  24 1 3.7 303.7 0 ...
##  $ DroppedBlockedCalls      : num  1.3 0.3 0 59.7 0 10.7 1 0.3 0 0 ...
##  $ CallForwardingCalls      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CallWaitingCalls         : num  0.3 0 0 22.7 0 0.7 0 0 0 0 ...
##  $ MonthsInService          : int  61 58 60 59 53 53 57 59 53 55 ...
##  $ UniqueSubs               : int  2 1 1 2 2 1 2 2 3 1 ...
##  $ ActiveSubs               : int  1 1 1 2 2 1 2 2 3 1 ...
##  $ ServiceArea              : chr  "SEAPOR503" "PITHOM412" "MILMIL414" "PITHOM412" ...
##  $ Handsets                 : int  2 2 1 9 4 3 2 3 4 9 ...
##  $ HandsetModels            : int  2 1 1 4 3 2 2 3 3 5 ...
##  $ CurrentEquipmentDays     : int  361 1504 1812 458 852 231 601 464 544 388 ...
##  $ AgeHH1                   : int  62 40 26 30 46 28 52 46 36 46 ...
##  $ AgeHH2                   : int  0 42 26 0 54 0 58 46 34 68 ...
##  $ ChildrenInHH             : chr  "No" "Yes" "Yes" "No" ...
##  $ HandsetRefurbished       : chr  "No" "No" "No" "No" ...
##  $ HandsetWebCapable        : chr  "Yes" "No" "No" "Yes" ...
##  $ TruckOwner               : chr  "No" "No" "No" "No" ...
##  $ RVOwner                  : chr  "No" "No" "No" "No" ...
##  $ Homeownership            : chr  "Known" "Known" "Unknown" "Known" ...
##  $ BuysViaMailOrder         : chr  "Yes" "Yes" "No" "Yes" ...
##  $ RespondsToMailOffers     : chr  "Yes" "Yes" "No" "Yes" ...
##  $ OptOutMailings           : chr  "No" "No" "No" "No" ...
##  $ NonUSTravel              : chr  "No" "No" "No" "No" ...
##  $ OwnsComputer             : chr  "Yes" "Yes" "No" "No" ...
##  $ HasCreditCard            : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ RetentionCalls           : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ RetentionOffersAccepted  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NewCellphoneUser         : chr  "No" "Yes" "Yes" "Yes" ...
##  $ NotNewCellphoneUser      : chr  "No" "No" "No" "No" ...
##  $ ReferralsMadeBySubscriber: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ IncomeGroup              : int  4 5 6 6 9 1 9 6 9 5 ...
##  $ OwnsMotorcycle           : chr  "No" "No" "No" "No" ...
##  $ AdjustmentsToCreditRating: int  0 0 0 0 1 1 1 0 0 1 ...
##  $ HandsetPrice             : chr  "30" "30" "Unknown" "10" ...
##  $ MadeCallToRetentionTeam  : chr  "Yes" "No" "No" "No" ...
##  $ CreditRating             : Factor w/ 7 levels "1-Highest","2-High",..: 1 4 3 4 1 3 1 1 1 3 ...
##  $ PrizmCode                : chr  "Suburban" "Suburban" "Town" "Other" ...
##  $ Occupation               : Factor w/ 8 levels "Clerical","Crafts",..: 5 5 2 4 5 4 7 5 4 5 ...
##  $ MaritalStatus            : chr  "No" "Yes" "Yes" "No" ...
##  $ perc_recurrent_charge    : num  91.7 100.1 100 91.2 99.2 ...
##  $ perc_overage_minute      : num  0 0 0 0 NaN 0 0 0 0 0 ...

Let us check the count for churn.

## Tally churn outcomes, largest group first.
churn_counts <- Telecom_Data %>%
  dplyr::count(Churn, sort = TRUE)

We are plotting a bar chart here to see the churn distribution.

library("ggplot2")
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
# A single stacked bar wrapped in polar coordinates renders as a pie chart.
ggplot(data = churn_counts, aes(x = "", y = n, fill = Churn)) +
  geom_col() +
  coord_polar("y")

Trying out plotly to build a more interactive pie chart.

library(plotly)
# Named pie_colors rather than colors: "colors" would mask grDevices::colors().
pie_colors <- c('rgb(211,94,96)', 'rgb(128,133,133)', 'rgb(144,103,167)',
                'rgb(171,104,87)', 'rgb(114,147,203)')

# Interactive pie chart of churn counts with labelled, percentage-annotated slices.
fig <- plot_ly(type = 'pie', labels = churn_counts$Churn, values = churn_counts$n,
               textinfo = 'label+percent',
               insidetextorientation = 'radial',
               marker = list(colors = pie_colors,
                             line = list(color = '#FFFFFF', width = 1)))
fig

Creating a subset for churned and retained customers for deep dive analysis and checking the summary of the divided data to analyze the trend.

library(dplyr)

# Split the data by churn outcome for side-by-side comparison.
# Top-level assignment uses <- (not =), per R style conventions.
Telecom_Data_yes <- filter(Telecom_Data, Churn == "Yes")

Telecom_Data_no <- filter(Telecom_Data, Churn == "No")

# Summary statistics for customers who churned.
summary(Telecom_Data_yes)
##    CustomerID         Churn           MonthlyRevenue MonthlyMinutes
##  Min.   :3000002   Length:14711       Min.   :  0    Min.   :   0  
##  1st Qu.:3099298   Class :character   1st Qu.: 33    1st Qu.: 132  
##  Median :3195614   Mode  :character   Median : 48    Median : 330  
##  Mean   :3194322                      Mean   : 58    Mean   : 484  
##  3rd Qu.:3286308                      3rd Qu.: 70    3rd Qu.: 667  
##  Max.   :3399978                      Max.   :861    Max.   :5410  
##                                       NA's   :70     NA's   :70    
##  TotalRecurringCharge DirectorAssistedCalls OverageMinutes  RoamingCalls
##  Min.   :-11          Min.   : 0.0          Min.   :   0   Min.   :  0  
##  1st Qu.: 30          1st Qu.: 0.0          1st Qu.:   0   1st Qu.:  0  
##  Median : 44          Median : 0.2          Median :   4   Median :  0  
##  Mean   : 45          Mean   : 0.8          Mean   :  43   Mean   :  1  
##  3rd Qu.: 55          3rd Qu.: 0.7          3rd Qu.:  46   3rd Qu.:  0  
##  Max.   :338          Max.   :45.8          Max.   :2018   Max.   :851  
##  NA's   :70           NA's   :70            NA's   :70     NA's   :70   
##  PercChangeMinutes PercChangeRevenues  DroppedCalls    BlockedCalls  
##  Min.   :-2868     Min.   :-851       Min.   :  0.0   Min.   :  0.0  
##  1st Qu.: -101     1st Qu.:  -8       1st Qu.:  0.7   1st Qu.:  0.0  
##  Median :  -11     Median :   0       Median :  3.0   Median :  1.0  
##  Mean   :  -25     Mean   :   0       Mean   :  5.8   Mean   :  4.0  
##  3rd Qu.:   54     3rd Qu.:   2       3rd Qu.:  7.3   3rd Qu.:  3.3  
##  Max.   : 5192     Max.   :2484       Max.   :208.7   Max.   :314.7  
##  NA's   :208       NA's   :208                                       
##  UnansweredCalls CustomerCareCalls ThreewayCalls   ReceivedCalls  OutboundCalls
##  Min.   :  0     Min.   :  0.0     Min.   : 0.00   Min.   :   0   Min.   :  0  
##  1st Qu.:  4     1st Qu.:  0.0     1st Qu.: 0.00   1st Qu.:   6   1st Qu.:  2  
##  Median : 15     Median :  0.0     Median : 0.00   Median :  45   Median : 12  
##  Mean   : 26     Mean   :  1.6     Mean   : 0.26   Mean   : 105   Mean   : 24  
##  3rd Qu.: 34     3rd Qu.:  1.3     3rd Qu.: 0.30   3rd Qu.: 140   3rd Qu.: 32  
##  Max.   :849     Max.   :172.3     Max.   :30.00   Max.   :2619   Max.   :520  
##                                                                                
##   InboundCalls   PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls
##  Min.   :  0.0   Min.   :   0   Min.   :   0      Min.   :  0        
##  1st Qu.:  0.0   1st Qu.:  19   1st Qu.:   9      1st Qu.:  2        
##  Median :  1.7   Median :  58   Median :  31      Median :  5        
##  Mean   :  7.3   Mean   :  84   Mean   :  62      Mean   : 10        
##  3rd Qu.:  8.0   3rd Qu.: 114   3rd Qu.:  80      3rd Qu.: 12        
##  Max.   :298.3   Max.   :1359   Max.   :1314      Max.   :329        
##                                                                      
##  CallForwardingCalls CallWaitingCalls MonthsInService   UniqueSubs   
##  Min.   : 0.0        Min.   :  0.0    Min.   : 6      Min.   :  1.0  
##  1st Qu.: 0.0        1st Qu.:  0.0    1st Qu.:12      1st Qu.:  1.0  
##  Median : 0.0        Median :  0.0    Median :17      Median :  1.0  
##  Mean   : 0.0        Mean   :  1.6    Mean   :19      Mean   :  1.6  
##  3rd Qu.: 0.0        3rd Qu.:  1.3    3rd Qu.:24      3rd Qu.:  2.0  
##  Max.   :33.7        Max.   :135.7    Max.   :61      Max.   :196.0  
##                                                                      
##    ActiveSubs   ServiceArea           Handsets     HandsetModels 
##  Min.   : 0.0   Length:14711       Min.   : 1.00   Min.   : 1.0  
##  1st Qu.: 1.0   Class :character   1st Qu.: 1.00   1st Qu.: 1.0  
##  Median : 1.0   Mode  :character   Median : 1.00   Median : 1.0  
##  Mean   : 1.4                      Mean   : 1.74   Mean   : 1.5  
##  3rd Qu.: 2.0                      3rd Qu.: 2.00   3rd Qu.: 2.0  
##  Max.   :53.0                      Max.   :22.00   Max.   :14.0  
##                                                                  
##  CurrentEquipmentDays     AgeHH1         AgeHH2     ChildrenInHH      
##  Min.   :  -4         Min.   : 0.0   Min.   : 0.0   Length:14711      
##  1st Qu.: 249         1st Qu.: 0.0   1st Qu.: 0.0   Class :character  
##  Median : 366         Median :34.0   Median : 0.0   Mode  :character  
##  Mean   : 422         Mean   :30.3   Mean   :20.4                     
##  3rd Qu.: 564         3rd Qu.:48.0   3rd Qu.:42.0                     
##  Max.   :1779         Max.   :98.0   Max.   :99.0                     
##                       NA's   :249    NA's   :249                      
##  HandsetRefurbished HandsetWebCapable   TruckOwner          RVOwner         
##  Length:14711       Length:14711       Length:14711       Length:14711      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Homeownership      BuysViaMailOrder   RespondsToMailOffers OptOutMailings    
##  Length:14711       Length:14711       Length:14711         Length:14711      
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  NonUSTravel        OwnsComputer       HasCreditCard      RetentionCalls
##  Length:14711       Length:14711       Length:14711       Min.   :0.00  
##  Class :character   Class :character   Class :character   1st Qu.:0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.00  
##                                                           Mean   :0.06  
##                                                           3rd Qu.:0.00  
##                                                           Max.   :4.00  
##                                                                         
##  RetentionOffersAccepted NewCellphoneUser   NotNewCellphoneUser
##  Min.   :0.000           Length:14711       Length:14711       
##  1st Qu.:0.000           Class :character   Class :character   
##  Median :0.000           Mode  :character   Mode  :character   
##  Mean   :0.026                                                 
##  3rd Qu.:0.000                                                 
##  Max.   :3.000                                                 
##                                                                
##  ReferralsMadeBySubscriber  IncomeGroup   OwnsMotorcycle    
##  Min.   :0.00              Min.   :0.00   Length:14711      
##  1st Qu.:0.00              1st Qu.:0.00   Class :character  
##  Median :0.00              Median :5.00   Mode  :character  
##  Mean   :0.05              Mean   :4.26                     
##  3rd Qu.:0.00              3rd Qu.:7.00                     
##  Max.   :9.00              Max.   :9.00                     
##                                                             
##  AdjustmentsToCreditRating HandsetPrice       MadeCallToRetentionTeam
##  Min.   :0.00              Length:14711       Length:14711           
##  1st Qu.:0.00              Class :character   Class :character       
##  Median :0.00              Mode  :character   Mode  :character       
##  Mean   :0.04                                                        
##  3rd Qu.:0.00                                                        
##  Max.   :9.00                                                        
##                                                                      
##     CreditRating   PrizmCode                Occupation    MaritalStatus     
##  1-Highest:2628   Length:14711       Other       :10932   Length:14711      
##  2-High   :5712   Class :character   Professional: 2467   Class :character  
##  3-Good   :2608   Mode  :character   Crafts      :  426   Mode  :character  
##  4-Medium :1399                      Clerical    :  289                     
##  5-Low    :1436                      Self        :  243                     
##  6-VeryLow: 316                      Retired     :  185                     
##  7-Lowest : 612                      (Other)     :  169                     
##  perc_recurrent_charge perc_overage_minute
##  Min.   :-30           Min.   :  0        
##  1st Qu.: 69           1st Qu.:  0        
##  Median : 94           Median :  1        
##  Mean   : 88           Mean   :  7        
##  3rd Qu.:106           3rd Qu.: 10        
##  Max.   :514           Max.   :100        
##  NA's   :72            NA's   :512
# Summary statistics for customers who stayed, for comparison with churners.
summary(Telecom_Data_no)
##    CustomerID         Churn           MonthlyRevenue MonthlyMinutes
##  Min.   :3000014   Length:36336       Min.   :  -6   Min.   :   0  
##  1st Qu.:3101025   Class :character   1st Qu.:  34   1st Qu.: 170  
##  Median :3204388   Mode  :character   Median :  49   Median : 381  
##  Mean   :3205048                      Mean   :  59   Mean   : 543  
##  3rd Qu.:3313601                      3rd Qu.:  72   3rd Qu.: 743  
##  Max.   :3399994                      Max.   :1223   Max.   :7359  
##                                       NA's   :86     NA's   :86    
##  TotalRecurringCharge DirectorAssistedCalls OverageMinutes  RoamingCalls 
##  Min.   : -9          Min.   :  0.0         Min.   :   0   Min.   :   0  
##  1st Qu.: 30          1st Qu.:  0.0         1st Qu.:   0   1st Qu.:   0  
##  Median : 45          Median :  0.2         Median :   2   Median :   0  
##  Mean   : 48          Mean   :  0.9         Mean   :  39   Mean   :   1  
##  3rd Qu.: 60          3rd Qu.:  1.0         3rd Qu.:  39   3rd Qu.:   0  
##  Max.   :400          Max.   :159.4         Max.   :4321   Max.   :1112  
##  NA's   :86           NA's   :86            NA's   :86     NA's   :86    
##  PercChangeMinutes PercChangeRevenues  DroppedCalls    BlockedCalls
##  Min.   :-3875     Min.   :-1108      Min.   :  0.0   Min.   :  0  
##  1st Qu.:  -78     1st Qu.:   -7      1st Qu.:  1.0   1st Qu.:  0  
##  Median :   -3     Median :    0      Median :  3.0   Median :  1  
##  Mean   :   -6     Mean   :   -1      Mean   :  6.1   Mean   :  4  
##  3rd Qu.:   70     3rd Qu.:    2      3rd Qu.:  7.7   3rd Qu.:  4  
##  Max.   : 4480     Max.   : 1347      Max.   :221.7   Max.   :384  
##  NA's   :159       NA's   :159                                     
##  UnansweredCalls CustomerCareCalls ThreewayCalls  ReceivedCalls  OutboundCalls
##  Min.   :  0     Min.   :  0       Min.   : 0.0   Min.   :   0   Min.   :  0  
##  1st Qu.:  6     1st Qu.:  0       1st Qu.: 0.0   1st Qu.:  10   1st Qu.:  4  
##  Median : 17     Median :  0       Median : 0.0   Median :  56   Median : 14  
##  Mean   : 29     Mean   :  2       Mean   : 0.3   Mean   : 119   Mean   : 26  
##  3rd Qu.: 37     3rd Qu.:  2       3rd Qu.: 0.3   3rd Qu.: 159   3rd Qu.: 35  
##  Max.   :840     Max.   :327       Max.   :66.0   Max.   :2692   Max.   :644  
##                                                                               
##   InboundCalls PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls
##  Min.   :  0   Min.   :   0   Min.   :   0      Min.   :  0        
##  1st Qu.:  0   1st Qu.:  25   1st Qu.:  12      1st Qu.:  2        
##  Median :  2   Median :  64   Median :  38      Median :  6        
##  Mean   :  9   Mean   :  93   Mean   :  70      Mean   : 10        
##  3rd Qu.: 10   3rd Qu.: 124   3rd Qu.:  92      3rd Qu.: 13        
##  Max.   :519   Max.   :2091   Max.   :1475      Max.   :412        
##                                                                    
##  CallForwardingCalls CallWaitingCalls MonthsInService   UniqueSubs   
##  Min.   : 0.0        Min.   :  0.0    Min.   : 6.0    Min.   : 1.00  
##  1st Qu.: 0.0        1st Qu.:  0.0    1st Qu.:11.0    1st Qu.: 1.00  
##  Median : 0.0        Median :  0.3    Median :16.0    Median : 1.00  
##  Mean   : 0.0        Mean   :  1.9    Mean   :18.6    Mean   : 1.51  
##  3rd Qu.: 0.0        3rd Qu.:  1.7    3rd Qu.:24.0    3rd Qu.: 2.00  
##  Max.   :81.3        Max.   :212.7    Max.   :60.0    Max.   :12.00  
##                                                                      
##    ActiveSubs    ServiceArea           Handsets     HandsetModels  
##  Min.   : 0.00   Length:36336       Min.   : 1.00   Min.   : 1.00  
##  1st Qu.: 1.00   Class :character   1st Qu.: 1.00   1st Qu.: 1.00  
##  Median : 1.00   Mode  :character   Median : 1.00   Median : 1.00  
##  Mean   : 1.35                      Mean   : 1.83   Mean   : 1.58  
##  3rd Qu.: 2.00                      3rd Qu.: 2.00   3rd Qu.: 2.00  
##  Max.   :11.00                      Max.   :24.00   Max.   :15.00  
##                                     NA's   :1       NA's   :1      
##  CurrentEquipmentDays     AgeHH1        AgeHH2    ChildrenInHH      
##  Min.   :  -5         Min.   : 0    Min.   : 0    Length:36336      
##  1st Qu.: 197         1st Qu.: 0    1st Qu.: 0    Class :character  
##  Median : 310         Median :36    Median : 0    Mode  :character  
##  Mean   : 364         Mean   :32    Mean   :21                      
##  3rd Qu.: 493         3rd Qu.:48    3rd Qu.:44                      
##  Max.   :1812         Max.   :99    Max.   :98                      
##  NA's   :1            NA's   :660   NA's   :660                     
##  HandsetRefurbished HandsetWebCapable   TruckOwner          RVOwner         
##  Length:36336       Length:36336       Length:36336       Length:36336      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Homeownership      BuysViaMailOrder   RespondsToMailOffers OptOutMailings    
##  Length:36336       Length:36336       Length:36336         Length:36336      
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  NonUSTravel        OwnsComputer       HasCreditCard      RetentionCalls 
##  Length:36336       Length:36336       Length:36336       Min.   :0.000  
##  Class :character   Class :character   Class :character   1st Qu.:0.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.000  
##                                                           Mean   :0.029  
##                                                           3rd Qu.:0.000  
##                                                           Max.   :3.000  
##                                                                          
##  RetentionOffersAccepted NewCellphoneUser   NotNewCellphoneUser
##  Min.   :0.000           Length:36336       Length:36336       
##  1st Qu.:0.000           Class :character   Class :character   
##  Median :0.000           Mode  :character   Mode  :character   
##  Mean   :0.015                                                 
##  3rd Qu.:0.000                                                 
##  Max.   :3.000                                                 
##                                                                
##  ReferralsMadeBySubscriber  IncomeGroup   OwnsMotorcycle    
##  Min.   : 0.0              Min.   :0.00   Length:36336      
##  1st Qu.: 0.0              1st Qu.:1.00   Class :character  
##  Median : 0.0              Median :5.00   Mode  :character  
##  Mean   : 0.1              Mean   :4.35                     
##  3rd Qu.: 0.0              3rd Qu.:7.00                     
##  Max.   :35.0              Max.   :9.00                     
##                                                             
##  AdjustmentsToCreditRating HandsetPrice       MadeCallToRetentionTeam
##  Min.   : 0.00             Length:36336       Length:36336           
##  1st Qu.: 0.00             Class :character   Class :character       
##  Median : 0.00             Mode  :character   Mode  :character       
##  Mean   : 0.06                                                       
##  3rd Qu.: 0.00                                                       
##  Max.   :25.00                                                       
##                                                                      
##     CreditRating    PrizmCode                Occupation    MaritalStatus     
##  1-Highest: 5894   Length:36336       Other       :26705   Length:36336      
##  2-High   :13281   Class :character   Professional: 6288   Class :character  
##  3-Good   : 5802   Mode  :character   Crafts      : 1093   Mode  :character  
##  4-Medium : 3958                      Clerical    :  697                     
##  5-Low    : 5063                      Self        :  636                     
##  6-VeryLow:  836                      Retired     :  548                     
##  7-Lowest : 1502                      (Other)     :  369                     
##  perc_recurrent_charge perc_overage_minute
##  Min.   :-28.1         Min.   :  0        
##  1st Qu.: 74.0         1st Qu.:  0        
##  Median : 96.1         Median :  1        
##  Mean   :  Inf         Mean   :  6        
##  3rd Qu.:110.4         3rd Qu.:  8        
##  Max.   :  Inf         Max.   :100        
##  NA's   :89            NA's   :367

Showing the classification of our data into major categories.

# Feature-type counts prepared in a separate CSV.
# read.csv() already returns a data.frame; the extra wrapper was redundant.
feat_typ_counts <- read.csv("Feat_type_counts.csv")
#install.packages("plotrix")
library(plotrix)

library("ggplot2")
#pie(feat_typ_counts$Counts, feat_typ_counts$Variable.Type)

piepercent <- round(100 * feat_typ_counts$Counts / sum(feat_typ_counts$Counts), 1)

# Build the donut-chart geometry in one mutate() chain: each feature type
# becomes a ring segment spanning [ymin, ymax] on the cumulative-share scale,
# labelled at its midpoint.
feat_typ_counts <- feat_typ_counts %>%
  mutate(
    fraction      = Counts / sum(Counts),
    ymax          = cumsum(fraction),        # top edge of each segment
    ymin          = lag(ymax, default = 0),  # bottom edge of each segment
    labelPosition = (ymax + ymin) / 2,       # midpoint used for the label
    label         = paste0(Variable.Type, "\n Count: ", Counts)
  )

ggplot(feat_typ_counts, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3,
                            fill = Variable.Type)) +
  geom_rect() +
  geom_label(x = 3.5, aes(y = labelPosition, label = label), size = 2) +
  scale_fill_brewer(palette = 4) +
  coord_polar(theta = "y") +
  xlim(c(2, 4)) +         # xlim < xmin creates the donut hole
  theme_void() +
  theme(legend.position = "none")

##Plotting a Box plot of Monthly Minutes

# Horizontal, notched boxplot of customers' monthly minutes.
boxplot(
  Telecom_Data$MonthlyMinutes,
  main       = "Monthly Minutes of Customers",
  xlab       = "Monthly Min",
  ylab       = "Frequency",
  col        = "orange",
  border     = "brown",
  horizontal = TRUE,
  notch      = TRUE
)

## Current handset use in days, grouped by churn status

plot_ly(Telecom_Data, y = ~CurrentEquipmentDays, color = ~Churn, type = "box") %>%
  layout(boxmode = "group",
         xaxis = list(title = ''),
         yaxis = list(title = 'Frequency'))

## Boxplot of Total Recurring Charge

plot_ly(Telecom_Data, y = ~TotalRecurringCharge, color = ~Churn, type = "box") %>%
  layout(boxmode = "group",
         xaxis = list(title = ''),
         yaxis = list(title = 'Frequency'))

## Boxplot of Months in Service

plot_ly(Telecom_Data, y = ~MonthsInService, color = ~Churn, type = "box") %>%
  layout(boxmode = "group",
         xaxis = list(title = ''),
         yaxis = list(title = 'Frequency'))

## Boxplot of the percent change in recurring charge (derived column)

plot_ly(Telecom_Data, y = ~perc_recurrent_charge, color = ~Churn, type = "box") %>%
  layout(boxmode = "group",
         xaxis = list(title = ''),
         yaxis = list(title = 'Frequency'))

## Boxplot of percent change in minutes

plot_ly(Telecom_Data, y = ~PercChangeMinutes, color = ~Churn, type = "box") %>%
  layout(boxmode = "group",
         xaxis = list(title = ''),
         yaxis = list(title = 'Frequency'))

## Boxplot of percent change in revenues

plot_ly(Telecom_Data, y = ~PercChangeRevenues, color = ~Churn, type = "box") %>%
  layout(boxmode = "group",
         xaxis = list(title = ''),
         yaxis = list(title = 'Frequency'))

##Distribution of the Monthly Revenue

library(ggplot2)
library(plotly)

set.seed(1)

# Histogram (on the density scale) with an overlaid kernel density
# estimate and a rug of the raw observations.
gg <- ggplot(Telecom_Data, aes(x = MonthlyRevenue, color = 'density')) +
  geom_histogram(aes(y = ..density..), bins = 7, fill = '#67B7D1', alpha = 0.5) +
  geom_density(color = '#67B7D1') +
  geom_rug(color = '#67B7D1') +
  ylab("") +
  xlab("") + theme(legend.title = element_blank()) +
  scale_color_manual(values = c('density' = '#67B7D1'))

# Interactive version. Axis titles fixed: the x axis is the revenue
# value and the y axis is the estimated density — the original labels
# ('Time' / 'Monthly Revenue') were copied from a time-series template.
ggplotly(gg) %>%
  layout(plot_bgcolor = '#e5ecf6',
         xaxis = list(
           title = 'Monthly Revenue',
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'),
         yaxis = list(
           title = 'Density',
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'))

##Distribution of Monthly Minutes

library(ggplot2)  
library(plotly)

set.seed(1)    

# Histogram on the density scale with an overlaid kernel density
# estimate and a rug of the raw observations for MonthlyMinutes.
gg <- ggplot(Telecom_Data,aes(x = MonthlyMinutes, color = 'density')) +  
  geom_histogram(aes(y = ..density..), bins = 7,  fill = '#67B7D1', alpha = 0.5) +  
  geom_density(color = '#67B7D1') +  
  geom_rug(color = '#67B7D1') + 
  ylab("") + 
  xlab("")  + theme(legend.title=element_blank()) +
  scale_color_manual(values = c('density' = '#67B7D1'))

# Interactive version of the plot above.
# NOTE(review): the y axis shows a density, so the 'Frequency' title
# below is slightly misleading — confirm the intended label.
ggplotly(gg)%>% 
  layout(plot_bgcolor='#e5ecf6',   
             xaxis = list(   
               title='Monthly Minutes ', 
               zerolinecolor = '#ffff',   
               zerolinewidth = 2,   
               gridcolor = 'ffff'),   
             yaxis = list(   
               title='Frequency', 
               zerolinecolor = '#ffff',   
               zerolinewidth = 2,   
               gridcolor = 'ffff')) 
# Normal Q-Q plot to eyeball departure from normality.
qqnorm(Telecom_Data$MonthlyMinutes)                        # QQplot 


qqline(Telecom_Data$MonthlyMinutes, col = "red") 

#install.packages("car")
#library("car")
#qqPlot(Telecom_Data$MonthlyMinutes)
library("plotly")
#plot_ly(Telecom_Data, y= Telecom_Data$AgeHH1, color = Telecom_Data$Churn, type = "box") 
         #layout(boxmode = "group", 
        # xaxis = list(title=''), 
        # yaxis = list(title='Frequency'))

Checking how Non US travel affects the churn rate

# Number of churn observations. nrow() returns NULL on an atomic
# vector, so the original nrow(Telecom_Data$Churn) silently produced
# NULL; length() is the correct way to size a single column.
churn_count <- length(Telecom_Data$Churn)
# Stacked bar chart: churn split within each non-US-travel category.
ggplot(Telecom_Data, aes(x = NonUSTravel, fill = Churn)) +
  geom_bar(position = "stack") + ggtitle("How Travel affects churn")

Do the number of dropped calls have an effect on churn?

# Overlaid histograms of dropped calls, split by churn status.
ggplot(Telecom_Data, aes(x = DroppedCalls, fill = Churn)) +
  geom_histogram(position = 'identity', alpha = 0.6)

Income group of the customers

# Overlaid histograms of income group, split by churn status.
ggplot(Telecom_Data, aes(x = IncomeGroup, fill = Churn)) +
  geom_histogram(position = 'identity', alpha = 0.6)

How many customers are opting out of the mailing list?

# Overlaid bar chart of the opt-out-of-mailings flag, split by churn.
ggplot(Telecom_Data, aes(x = OptOutMailings, fill = Churn)) +
  geom_bar(position = 'identity', alpha = 0.6)

Histogram for representing age of customers in Telecom Data

library(plotly)
# Distribution of the primary household member's age (AgeHH1).
ggplot(Telecom_Data, aes(x = AgeHH1)) +
  geom_histogram(color = "aquamarine4", fill = "aquamarine3", alpha = 0.6, bins = 30) +
  labs(x = "Age of Customers", y = "Frequency",
       title = "Histogram of Customer Age") +
  theme_classic()

Filtering age of primary users

library(dplyr)
# Customers whose primary-household age is recorded as 0, i.e. a
# missing/invalid age entry.
AgeFiltered = filter(Telecom_Data, AgeHH1== 0)
nrow(AgeFiltered)
## [1] 13917
# Share of such customers. Derived from the data itself rather than
# the hard-coded 13917, so the figure stays correct if the data
# changes upstream.
(nrow(AgeFiltered)/nrow(Telecom_Data))*100
## [1] 27.3

Boxplot representing customer age group in Telecom Data using ggplot

library(ggplot2)
# Boxplot of the primary household member's age.
ggplot(Telecom_Data, aes(y = AgeHH1)) +
  geom_boxplot(colour = "maroon", fill = "aquamarine3", alpha = 0.6) +
  ggtitle("Boxplot of Customer Age group`") +
  labs(x = "Age of Customers", y = " Frequency") +
  theme_classic()

Boxplot representing Customer age group in Telecom Data using plotly

library(plotly)
# Grouped boxplots of AgeHH1 by churn status.
plot_ly(Telecom_Data, y = ~AgeHH1, type = "box", color = ~Churn) %>%
  layout(boxmode = "group",
         xaxis = list(title = ''),
         yaxis = list(title = 'Frequency'))

0.1 Does credit rating have an impact on churn rate?

Boxplot for Credit Rating using ggplot

library(ggplot2)
# NOTE(review): CreditRating is an ordered categorical column (the
# summary above shows levels "1-Highest" ... "7-Lowest"); mapping it
# to a boxplot's continuous y aesthetic is questionable — a bar chart
# of counts is likely what was intended. Confirm before relying on
# this figure.
ggplot(Telecom_Data, aes(y=CreditRating)) + geom_boxplot( colour="orange", fill="black") + ggtitle("Credit Rating  using `ggplot`")

Creating a subset for Churned and Retained customers data

# Split the data by churn outcome for the side-by-side comparisons below.
Churned  <- subset(Telecom_Data, Churn == "Yes")
Retained <- subset(Telecom_Data, Churn == "No")

Frequency barplot for Credit Rating of Churned customers using ggplot

library(ggplot2)
# Counts of churned customers within each credit-rating level.
ggplot(Churned, aes(x = CreditRating)) +
  geom_bar(col = "black", fill = "red", alpha = 0.4) +
  ggtitle("Credit Rating for Churned Telecom Data") +
  labs(x = "Credit Rating (x-axis)", y = " Count (y-axis)") +
  ylim(0, 15000) +
  theme_classic()

Frequency barplot for Credit Rating of Retained customers using ggplot

library(ggplot2)
# Counts of retained customers within each credit-rating level
# (same y limits as the churned plot for direct comparison).
ggplot(Retained, aes(x = CreditRating)) +
  geom_bar(col = "black", fill = "aquamarine3", alpha = 0.6) +
  ggtitle("Credit Rating for Retained Telecom Data") +
  labs(x = "Credit Rating (x-axis)", y = " Count (y-axis)") +
  ylim(0, 15000) +
  theme_classic()

0.2 At what duration is the churn rate high for the customers?

# Histogram of service tenure (months) among churned customers.
ggplot(Churned, aes(x = MonthsInService, fill = Churn)) +
  geom_histogram(position = 'identity', alpha = 0.6,
                 color = 'aquamarine4', fill = 'aquamarine3') +
  xlab("Service period for churned customers (In Months) ") +
  ylab("Frequency") +
  theme_classic() +
  ggtitle("Service Months Distribution for Churned customers")

# Central tendency of tenure among churned customers.
Mean_MonthsInService <- mean(Churned$MonthsInService)
print(paste("Mean of service months of the customer:", Mean_MonthsInService))
## [1] "Mean of service months of the customer: 19.0443205764394"
Median_MonthsInService <- median(Churned$MonthsInService)
print(paste("Median of service months of the customer:", Median_MonthsInService))
## [1] "Median of service months of the customer: 17"

0.3 In what Prizm codes are the Churn Rates high?

# Barplot for Prizm Code effect on Churn
ggplot(Telecom_Data, aes(x = PrizmCode, fill = Churn)) +
  geom_bar(position = "dodge2") +
  ggtitle("Churn distribution for Prizm code")

#install.packages("plotly")
library(plotly)
colors <- c('rgb(128,133,133)', 'rgb(144,103,167)', 'rgb(171,104,87)', 'rgb(114,147,203)')

# Pie charts of Prizm-code shares for churned vs retained customers.
# Fixes to the original: it passed Churned$n / Retained$n, columns
# that do not exist in these subsets (so values was NULL); `line =`
# was a stray top-level argument instead of part of `marker`; and the
# `colors` vector above was defined but never used.
churned_prizm <- as.data.frame(table(Churned$PrizmCode))
fig <- plot_ly(type = 'pie',
               labels = churned_prizm$Var1, values = churned_prizm$Freq,
               textinfo = 'label+percent',
               insidetextorientation = 'radial',
               marker = list(colors = colors,
                             line = list(color = '#FFFFFF', width = 1)))
fig

retained_prizm <- as.data.frame(table(Retained$PrizmCode))
fig_1 <- plot_ly(type = 'pie',
                 labels = retained_prizm$Var1, values = retained_prizm$Freq,
                 textinfo = 'label+percent',
                 insidetextorientation = 'radial',
                 marker = list(colors = colors,
                               line = list(color = '#FFFFFF', width = 1)))
fig_1

0.4 Is occupation independent of churn ?

#Frequency distribution of Occupation

ggplot(Telecom_Data,aes(x=Occupation)) + geom_bar(fill = "aquamarine3") + ggtitle("Frequency distribution of occupation") 

#Creating a contingency table for Occupation and Churn
# Rows = occupation levels, columns = churn outcome ("No"/"Yes").
Occupation_Churn<-table(Telecom_Data$Occupation,Telecom_Data$Churn)
str(Occupation_Churn)
##  'table' int [1:8, 1:2] 697 1093 106 26705 6288 548 636 263 289 426 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:8] "Clerical" "Crafts" "Homemaker" "Other" ...
##   ..$ : chr [1:2] "No" "Yes"
#Performing Chi Square Test to check if occupation is independent of churn
# H0: occupation and churn are independent; a small p-value would
# reject independence.

chisq_test=chisq.test(Occupation_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  Occupation_Churn
## X-squared = 10, df = 7, p-value = 0.2
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.171354265327925"

The p-value is greater than 0.05, so we fail to reject the null hypothesis H0. Therefore we can say that occupation is independent of churn.

0.5 Does the monthly revenue average differ for different occupations?

H0: Average monthly revenue is similar across different occupations

H1: Average monthly revenue is different across different occupations

## Anova Results 
# One-way ANOVA testing whether mean MonthlyRevenue differs across
# occupations. The stated hypotheses (and the conclusion below)
# concern monthly revenue; the original model mistakenly used
# MonthlyMinutes as the response.
one.way <- aov(MonthlyRevenue ~ Occupation, data = Telecom_Data)
#summary(one.way)
xkabledply(one.way, title = "ANOVA result summary")

The p-value is less than 0.05, which means the null hypothesis is rejected.

The average monthly revenue differs across occupations.

Bivariate analysis of variables

# Reload the raw dataset for the bivariate analysis.
# NOTE(review): this re-read discards the cleaning done above (age
# imputation, inactive-customer removal) — confirm the bivariate
# analysis is meant to run on the raw data.
Telecom_Data <- read.csv("Telecom Data.csv")

Correlation Analysis of Monthly Revenue and Overage Minutes for churned customers.

# Churned customers only; check missingness in the two columns used
# for the correlation analysis below.
churndata <- subset(Telecom_Data, Churn == "Yes")
sum(is.na(churndata$MonthlyRevenue))
## [1] 70
sum(is.na(churndata$OverageMinutes))
## [1] 70

We can see there are 70 NA values in the MonthlyRevenue and OverageMinutes columns when Churn is "Yes". Let's remove those NA values.

# NOTE(review): na.omit() drops rows with an NA in ANY column, not
# just MonthlyRevenue/OverageMinutes as the surrounding text suggests
# — confirm this broader removal is intended.
churndata2 <- na.omit(churndata)
sum(is.na(churndata2$MonthlyRevenue))
## [1] 0
sum(is.na(churndata2$OverageMinutes))
## [1] 0

We have removed all the null values.

# Spearman rank correlation (rank-based, so robust to outliers)
# between monthly revenue and overage minutes for churned customers.
cor.test(churndata2$MonthlyRevenue, churndata2$OverageMinutes, method='spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  churndata2$MonthlyRevenue and churndata2$OverageMinutes
## S = 2e+11, p-value <2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##   rho 
## 0.591

The output is 0.591, which is approximately 0.6. As the sign is positive, we can say that monthly revenue and overage minutes vary positively when there is churn. That is, as the overage minutes increase, monthly revenue also increases.

#Plot with statistical results
library(ggplot2)
# Smoothed trend of monthly revenue against overage minutes for
# churned customers. The fixed colour is set OUTSIDE aes(): the
# original put color="Brown" inside aes(), which maps the literal
# string as a group and yields a spurious "Brown" legend entry
# instead of a brown line (the retention plot below already sets the
# colour correctly, outside aes()).
ggplot(data = churndata2) +
  geom_smooth(mapping = aes(x = OverageMinutes, y = MonthlyRevenue), color = "brown") +
  theme_classic() +
  xlab("Overage minutes used by the customer") + ylab("Monthly revenue of the Telecom company ")

From the graph we can see that as Overage Minutes increased, Monthly revenue also increased.

#install.packages("ggpubr")
library(ggpubr)
# Scatter plot with a confidence band for churned customers.
ggscatter(data = churndata2, x = "OverageMinutes", y = "MonthlyRevenue",
          conf.int = TRUE, color="brown", xlab="Overage Minutes used by customer", ylab="Monthly Revenue of the Telecom company", title="Scatter plot of Overage Minutes vs Monthly Revenue for churned customers")

The scatter graph in the aforementioned section illustrates the positive correlation between customer overage minutes consumed and the telecom sector’s monthly income for customers who churn.

Now lets plot when there is no churn, that is when there is retention of customers.

# Retained customers only; check missingness in the two columns used
# for the correlation analysis below.
retentiondata <- subset(Telecom_Data, Churn == "No")
sum(is.na(retentiondata$MonthlyRevenue))
## [1] 86
sum(is.na(retentiondata$OverageMinutes))
## [1] 86

We can see there are 86 NA values in the MonthlyRevenue and OverageMinutes columns when there is no churn. Let's remove those NA values.

# NOTE(review): as with the churned subset, na.omit() drops rows with
# an NA in ANY column, not only the two named above — confirm intended.
retentiondata2 <- na.omit(retentiondata)
sum(is.na(retentiondata2$MonthlyRevenue))
## [1] 0
sum(is.na(retentiondata2$OverageMinutes))
## [1] 0

We have removed all the null values. Now lets do correlation

# Spearman rank correlation between monthly revenue and overage
# minutes for retained customers.
cor.test(retentiondata2$MonthlyRevenue, retentiondata2$OverageMinutes, method='spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  retentiondata2$MonthlyRevenue and retentiondata2$OverageMinutes
## S = 3e+12, p-value <2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##   rho 
## 0.564

Even when there is retention the correlation is 0.56, so we can say that monthly revenue varies positively as overage minutes vary.

#Plot with statistical results

library(ggplot2)
# Smoothed trend of monthly revenue against overage minutes for
# retained customers (fixed colour correctly set outside aes()).
ggplot(data = retentiondata2) +
  geom_smooth(mapping = aes(x = OverageMinutes, y = MonthlyRevenue),
              color = "dark green") +
  theme_classic() +
  xlab("Overage minutes used by the customer") +
  ylab("Monthly revenue of the Telecom company ")

From the graph we can see that as Overage Minutes increases, Monthly revenue also increases.

#install.packages("ggpubr")
library(ggpubr)
# Scatter plot with a confidence band for retained customers.
ggscatter(data = retentiondata2, x = "OverageMinutes", y = "MonthlyRevenue",
          conf.int = TRUE, color="dark green", xlab="Overage Minutes used by customer", ylab="Monthly Revenue of the Telecom company", title="Scatter plot of Overage Minutes vs Monthly Revenue for retention customers")

The graph shows that even for customers who are still with the company, monthly revenue rises as Overage Minutes rise. Also, we can observe that outliers are more for churned customers compared to non-churn customers

# Candidate feature subset for the correlation matrix.
# NOTE(review): HandsetPrice and CreditRating are character/factor
# columns per the summary above — confirm ggcorrmat handles or drops
# them, or restrict this list to numeric columns.
datacorr <- Telecom_Data[ , c("MonthlyRevenue","MonthlyMinutes", "TotalRecurringCharge","CustomerCareCalls","ThreewayCalls","ReceivedCalls","OutboundCalls","MonthsInService","HandsetPrice","CreditRating")]   
sum(is.na(datacorr))
## [1] 468

There are 468 NA values in the subsetted dataset.

# Drop every row containing at least one NA across the selected columns.
datacorr2 <- na.omit(datacorr)
nrow(datacorr2)
## [1] 50891
sum(is.na(datacorr2))
## [1] 0

After removing the NA values, we are left out with 50891 rows.

Next, we have used few numerical factors, including months of service, outbound calls, received calls, three-way calls, customer care calls, total recurring calls, and monthly minutes, and we’ve conducted correlation statistical studies to look at how they relate to one another using correlation matrix.

# load package
#install.packages("ggstatsplot")
#install.packages("ggcorrplot")

library(ggstatsplot)
library(ggcorrplot)
library(corrplot)

# correlogram
# Spearman (nonparametric) correlation matrix across the selected
# features, with significance markings, rendered by ggstatsplot.
ggstatsplot::ggcorrmat(
data = datacorr2,
type = "nonparametric", # parametric for Pearson, nonparametric for Spearman's correlation
colors = c("darkred", "white", "steelblue"), 
title = "Correlation matrix"
)

The above correlation graph leads us to the following conclusions:

  1. Monthly minutes consumed by the client and monthly revenue of the telecom operator are highly associated.
  2. There is a significant probability that customers will receive calls from customer service.
  3. The number of months a consumer stays with a service won’t significantly alter its monthly revenue.
  4. The customer service calls made by the telecom sector have no impact on the number of months the clients have left on their subscriptions.

0.6 Data Preprocessing for Modeling

In the EDA, We observed Incorrect entries in agehh1 and agehh2 which had zero values that need to be imputed, We observed the distribution of it and decided to go with median imputation as age has skewness.

library(ggplot2)
# Age distributions BEFORE imputation (zeros are invalid entries).
ggplot(Telecom_Data, aes(x = AgeHH1)) +
  geom_histogram(aes(y = ..density..), colour = 1, fill = "cyan")

ggplot(Telecom_Data, aes(x = AgeHH2)) +
  geom_histogram(aes(y = ..density..), colour = 1, fill = "cyan")

# Flag ages below 1 as missing, then impute with the median of the
# remaining valid ages; the median is used because age is skewed.
Telecom_Data$AgeHH1 <- replace(Telecom_Data$AgeHH1, Telecom_Data$AgeHH1 < 1, NA)
Telecom_Data$AgeHH2 <- replace(Telecom_Data$AgeHH2, Telecom_Data$AgeHH2 < 1, NA)
Telecom_Data$AgeHH1[is.na(Telecom_Data$AgeHH1)] <- median(Telecom_Data$AgeHH1, na.rm = TRUE)
Telecom_Data$AgeHH2[is.na(Telecom_Data$AgeHH2)] <- median(Telecom_Data$AgeHH2, na.rm = TRUE)

library(ggplot2)
# Age distributions AFTER imputation.
ggplot(Telecom_Data, aes(x = AgeHH1)) +
  geom_histogram(aes(y = ..density..), colour = 1, fill = "cyan")

ggplot(Telecom_Data, aes(x = AgeHH2)) +
  geom_histogram(aes(y = ..density..), colour = 1, fill = "cyan")

0.7 checking inactive customers and removing them

library(plotly)

# Boxplot of monthly minutes before outlier removal.
fig <- plot_ly(y = Telecom_Data$MonthlyMinutes, type = "box", quartilemethod = "linear") # or "inclusive", or "linear" by default

fig
## checking inactive customers 
nrow(subset(Telecom_Data, MonthlyRevenue <= 0))
## [1] 9
## checking inactive customers 
nrow(subset(Telecom_Data, MonthlyMinutes <= 0))
## [1] 723
## Removing inactive customers (outliers)

# Keep only active customers: positive revenue AND positive minutes
# (a single combined filter, equivalent to two sequential subsets).
Telecom_Data <- subset(Telecom_Data, MonthlyRevenue > 0 & MonthlyMinutes > 0)

library(plotly)
fig <- plot_ly(y = Telecom_Data$MonthlyMinutes, type = "box", quartilemethod = "exclusive") # or "inclusive", or "linear" by default

fig
nrow(Telecom_Data)
## [1] 50162

0.8 Checking Null Values :

library(tidyverse)
# Per-column count of missing values after the cleaning above.
map(Telecom_Data, function(col) sum(is.na(col)))
## $CustomerID
## [1] 0
## 
## $Churn
## [1] 0
## 
## $MonthlyRevenue
## [1] 0
## 
## $MonthlyMinutes
## [1] 0
## 
## $TotalRecurringCharge
## [1] 0
## 
## $DirectorAssistedCalls
## [1] 0
## 
## $OverageMinutes
## [1] 0
## 
## $RoamingCalls
## [1] 0
## 
## $PercChangeMinutes
## [1] 173
## 
## $PercChangeRevenues
## [1] 173
## 
## $DroppedCalls
## [1] 0
## 
## $BlockedCalls
## [1] 0
## 
## $UnansweredCalls
## [1] 0
## 
## $CustomerCareCalls
## [1] 0
## 
## $ThreewayCalls
## [1] 0
## 
## $ReceivedCalls
## [1] 0
## 
## $OutboundCalls
## [1] 0
## 
## $InboundCalls
## [1] 0
## 
## $PeakCallsInOut
## [1] 0
## 
## $OffPeakCallsInOut
## [1] 0
## 
## $DroppedBlockedCalls
## [1] 0
## 
## $CallForwardingCalls
## [1] 0
## 
## $CallWaitingCalls
## [1] 0
## 
## $MonthsInService
## [1] 0
## 
## $UniqueSubs
## [1] 0
## 
## $ActiveSubs
## [1] 0
## 
## $ServiceArea
## [1] 0
## 
## $Handsets
## [1] 0
## 
## $HandsetModels
## [1] 0
## 
## $CurrentEquipmentDays
## [1] 0
## 
## $AgeHH1
## [1] 0
## 
## $AgeHH2
## [1] 0
## 
## $ChildrenInHH
## [1] 0
## 
## $HandsetRefurbished
## [1] 0
## 
## $HandsetWebCapable
## [1] 0
## 
## $TruckOwner
## [1] 0
## 
## $RVOwner
## [1] 0
## 
## $Homeownership
## [1] 0
## 
## $BuysViaMailOrder
## [1] 0
## 
## $RespondsToMailOffers
## [1] 0
## 
## $OptOutMailings
## [1] 0
## 
## $NonUSTravel
## [1] 0
## 
## $OwnsComputer
## [1] 0
## 
## $HasCreditCard
## [1] 0
## 
## $RetentionCalls
## [1] 0
## 
## $RetentionOffersAccepted
## [1] 0
## 
## $NewCellphoneUser
## [1] 0
## 
## $NotNewCellphoneUser
## [1] 0
## 
## $ReferralsMadeBySubscriber
## [1] 0
## 
## $IncomeGroup
## [1] 0
## 
## $OwnsMotorcycle
## [1] 0
## 
## $AdjustmentsToCreditRating
## [1] 0
## 
## $HandsetPrice
## [1] 0
## 
## $MadeCallToRetentionTeam
## [1] 0
## 
## $CreditRating
## [1] 0
## 
## $PrizmCode
## [1] 0
## 
## $Occupation
## [1] 0
## 
## $MaritalStatus
## [1] 0

0.8.1 Feature selection

0.8.1.1 Using Chi square test to select categorical variables

#Creating a contingency table for IncomeGroup and Churn
# Feature-selection step: chi-square test of independence between a
# candidate categorical feature and Churn. Features with small
# p-values are candidates to keep. The same pattern repeats for the
# other categorical columns below.

IncomeGroup_Churn<-table(Telecom_Data$IncomeGroup,Telecom_Data$Churn)
str(IncomeGroup_Churn)
##  'table' int [1:10, 1:2] 8863 1429 841 2189 2893 3022 6805 4083 1880 3960 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:10] "0" "1" "2" "3" ...
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(IncomeGroup_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  IncomeGroup_Churn
## X-squared = 29, df = 9, p-value = 6e-04
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.000566210665098686"

#Creating a contingency table for Service Area and Churn
# NOTE(review): this table has 748 levels including an empty ""
# service area; with that many sparse cells the chi-square
# approximation may be unreliable (chisq.test warns on low expected
# counts) — confirm before trusting the p-value.

Service_Area_Churn<-table(Telecom_Data$ServiceArea,Telecom_Data$Churn)
str(Service_Area_Churn)
##  'table' int [1:748, 1:2] 12 3 10 26 27 14 2 52 55 2 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:748] "" "AIRAIK803" "AIRAND864" "AIRASH828" ...
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(Service_Area_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  Service_Area_Churn
## X-squared = 957, df = 747, p-value = 3e-07
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 2.60818175962372e-07"

#Creating a contingency table for ChildrenInHH and Churn

ChildrenInHH_Churn<-table(Telecom_Data$ChildrenInHH,Telecom_Data$Churn)
str(ChildrenInHH_Churn)
##  'table' int [1:2, 1:2] 27327 8638 10649 3548
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(ChildrenInHH_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  ChildrenInHH_Churn
## X-squared = 5, df = 1, p-value = 0.02
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.0227012322219358"

#Creating a contingency table for HandsetWebCapable and Churn

HandsetWebCapable_Churn<-table(Telecom_Data$HandsetWebCapable,Telecom_Data$Churn)
str(HandsetWebCapable_Churn)
##  'table' int [1:2, 1:2] 3047 32918 1755 12442
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(HandsetWebCapable_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  HandsetWebCapable_Churn
## X-squared = 177, df = 1, p-value <2e-16
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 1.74606069444535e-40"

#Creating a contingency table for TruckOwner and Churn

TruckOwner_Churn<-table(Telecom_Data$TruckOwner,Telecom_Data$Churn)
str(TruckOwner_Churn)
##  'table' int [1:2, 1:2] 29187 6778 11583 2614
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(TruckOwner_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  TruckOwner_Churn
## X-squared = 1, df = 1, p-value = 0.3
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.267377510347789"

#Creating a contingency table for RVOwner and Churn

RVOwner_Churn<-table(Telecom_Data$RVOwner,Telecom_Data$Churn)
str(RVOwner_Churn)
##  'table' int [1:2, 1:2] 33004 2961 13061 1136
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(RVOwner_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  RVOwner_Churn
## X-squared = 0.7, df = 1, p-value = 0.4
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.404258410869843"

#Creating a contingency table for Homeownership and Churn

Homeownership_Churn<-table(Telecom_Data$Homeownership,Telecom_Data$Churn)
str(Homeownership_Churn)
##  'table' int [1:2, 1:2] 24086 11879 9316 4881
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "Known" "Unknown"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(Homeownership_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  Homeownership_Churn
## X-squared = 8, df = 1, p-value = 0.004
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.00398194844044722"

#Creating a contingency table for BuysViaMailOrder and Churn

BuysViaMailOrder_Churn<-table(Telecom_Data$BuysViaMailOrder,Telecom_Data$Churn)
str(BuysViaMailOrder_Churn)
##  'table' int [1:2, 1:2] 22732 13233 9306 4891
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(BuysViaMailOrder_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  BuysViaMailOrder_Churn
## X-squared = 24, df = 1, p-value = 9e-07
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 9.06780549992831e-07"

#Creating a contingency table for RespondsToMailOffers and Churn

RespondsToMailOffers_Churn<-table(Telecom_Data$RespondsToMailOffers,Telecom_Data$Churn)
str(RespondsToMailOffers_Churn)
##  'table' int [1:2, 1:2] 22172 13793 9105 5092
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(RespondsToMailOffers_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  RespondsToMailOffers_Churn
## X-squared = 27, df = 1, p-value = 2e-07
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 2.42683844479501e-07"

#Creating a contingency table for OptOutMailings and Churn

OptOutMailings_Churn<-table(Telecom_Data$OptOutMailings,Telecom_Data$Churn)
str(OptOutMailings_Churn)
##  'table' int [1:2, 1:2] 35441 524 13988 209
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(OptOutMailings_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  OptOutMailings_Churn
## X-squared = 0.007, df = 1, p-value = 0.9
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.931270932901643"

#Creating a contingency table for NonUSTravel and Churn

NonUSTravel_Churn<-table(Telecom_Data$NonUSTravel,Telecom_Data$Churn)
str(NonUSTravel_Churn)
##  'table' int [1:2, 1:2] 33923 2042 13407 790
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(NonUSTravel_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  NonUSTravel_Churn
## X-squared = 0.2, df = 1, p-value = 0.6
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.63599952578223"

#Creating a contingency table for OwnsComputer and Churn

OwnsComputer_Churn<-table(Telecom_Data$OwnsComputer,Telecom_Data$Churn)
str(OwnsComputer_Churn)
##  'table' int [1:2, 1:2] 29275 6690 11566 2631
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(OwnsComputer_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  OwnsComputer_Churn
## X-squared = 0.03, df = 1, p-value = 0.9
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.867290767438381"

#Creating a contingency table for HasCreditCard and Churn

HasCreditCard_Churn<-table(Telecom_Data$HasCreditCard,Telecom_Data$Churn)
str(HasCreditCard_Churn)
##  'table' int [1:2, 1:2] 11562 24403 4680 9517
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(HasCreditCard_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  HasCreditCard_Churn
## X-squared = 3, df = 1, p-value = 0.08
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.0800277134440576"

#Creating a contingency table for NewCellphoneUser and Churn

NewCellphoneUser_Churn<-table(Telecom_Data$NewCellphoneUser,Telecom_Data$Churn)
# Inspect the table we just built. The original inspected
# HasCreditCard_Churn here — a copy-paste slip; its stale str()
# output has been dropped accordingly.
str(NewCellphoneUser_Churn)
chisq_test=chisq.test(NewCellphoneUser_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  NewCellphoneUser_Churn
## X-squared = 2, df = 1, p-value = 0.1
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.137293233472184"

#Creating a contingency table for NotNewCellphoneUser and Churn

NotNewCellphoneUser_Churn<-table(Telecom_Data$NotNewCellphoneUser,Telecom_Data$Churn)
str(NotNewCellphoneUser_Churn)
##  'table' int [1:2, 1:2] 31042 4923 12202 1995
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(NotNewCellphoneUser_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  NotNewCellphoneUser_Churn
## X-squared = 1, df = 1, p-value = 0.3
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.293460939917897"

#Creating a contingency table for OwnsMotorcycle and Churn

OwnsMotorcycle_Churn<-table(Telecom_Data$OwnsMotorcycle,Telecom_Data$Churn)
str(OwnsMotorcycle_Churn)
##  'table' int [1:2, 1:2] 35503 462 13992 205
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(OwnsMotorcycle_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  OwnsMotorcycle_Churn
## X-squared = 2, df = 1, p-value = 0.2
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.173636753237995"

#Creating a contingency table for HandsetPrice and Churn

HandsetPrice_Churn<-table(Telecom_Data$HandsetPrice,Telecom_Data$Churn)
str(HandsetPrice_Churn)
##  'table' int [1:16, 1:2] 1408 898 1557 2990 7 908 5 11 5286 11 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:16] "10" "100" "130" "150" ...
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(HandsetPrice_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  HandsetPrice_Churn
## X-squared = 63, df = 15, p-value = 7e-08
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 6.75907520652387e-08"

#Creating a contingency table for MadeCallToRetentionTeam and Churn

MadeCallToRetentionTeam_Churn<-table(Telecom_Data$MadeCallToRetentionTeam,Telecom_Data$Churn)
str(MadeCallToRetentionTeam_Churn)
##  'table' int [1:2, 1:2] 35027 938 13453 744
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:2] "No" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test=chisq.test(MadeCallToRetentionTeam_Churn)
chisq_test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  MadeCallToRetentionTeam_Churn
## X-squared = 217, df = 1, p-value <2e-16
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 4.39796505029558e-49"

# Chi-square test of independence: CreditRating (7 ordered levels) vs Churn.

CreditRating_Churn <- table(Telecom_Data$CreditRating, Telecom_Data$Churn)
str(CreditRating_Churn)
##  'table' int [1:7, 1:2] 5788 13189 5757 3923 5013 823 1472 2512 5565 2520 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:7] "1-Highest" "2-High" "3-Good" "4-Medium" ...
##   ..$ : chr [1:2] "No" "Yes"
chisq_test <- chisq.test(CreditRating_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  CreditRating_Churn
## X-squared = 220, df = 6, p-value <2e-16
p_value <- chisq_test$p.value
print(paste("The p value is:", p_value))
## [1] "The p value is: 1.0527771595973e-44"

# Chi-square test of independence: PrizmCode (geography segment) vs Churn.

PrizmCode_Churn <- table(Telecom_Data$PrizmCode, Telecom_Data$Churn)
str(PrizmCode_Churn)
##  'table' int [1:4, 1:2] 17418 1640 11636 5271 6819 737 4452 2189
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:4] "Other" "Rural" "Suburban" "Town"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test <- chisq.test(PrizmCode_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  PrizmCode_Churn
## X-squared = 16, df = 3, p-value = 0.001
p_value <- chisq_test$p.value
print(paste("The p value is:", p_value))
## [1] "The p value is: 0.00112346149268779"

# Chi-square test of independence: Occupation (8 categories) vs Churn.

Occupation_Churn <- table(Telecom_Data$Occupation, Telecom_Data$Churn)
str(Occupation_Churn)
##  'table' int [1:8, 1:2] 687 1081 104 26453 6218 530 631 261 284 412 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:8] "Clerical" "Crafts" "Homemaker" "Other" ...
##   ..$ : chr [1:2] "No" "Yes"
chisq_test <- chisq.test(Occupation_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  Occupation_Churn
## X-squared = 9, df = 7, p-value = 0.2
p_value <- chisq_test$p.value
print(paste("The p value is:", p_value))
## [1] "The p value is: 0.23771216205849"

# Chi-square test of independence: MaritalStatus (No/Unknown/Yes) vs Churn.

MaritalStatus_Churn <- table(Telecom_Data$MaritalStatus, Telecom_Data$Churn)
str(MaritalStatus_Churn)
##  'table' int [1:3, 1:2] 9161 13616 13188 3324 5738 5135
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:3] "No" "Unknown" "Yes"
##   ..$ : chr [1:2] "No" "Yes"
chisq_test <- chisq.test(MaritalStatus_Churn)
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  MaritalStatus_Churn
## X-squared = 35, df = 2, p-value = 2e-08
p_value <- chisq_test$p.value
print(paste("The p value is:", p_value))
## [1] "The p value is: 2.17407648647709e-08"

#Dropped variables (not carried forward): Service_Area, HandsetWebCapable, MaritalStatus, Occupation, CreditRating, MadeCallToRetentionTeam, HandsetPrice, OwnsMotorcycle, NotNewCellphoneUser, NewCellphoneUser, OwnsComputer, NonUSTravel, OptOutMailings, RespondsToMailOffers, BuysViaMailOrder, RVOwner, TruckOwner. NOTE(review): the tests above report p < 0.05 for MaritalStatus, CreditRating, MadeCallToRetentionTeam and HandsetPrice, so "p value greater than 0.05" does not actually hold for those four — confirm why they were excluded.

#Selected categorical variables (p value less than 0.05): IncomeGroup (p = 0.000208431035771752), ChildrenInHH (p = 0.0316348562178264), Homeownership (p = 0.00303998368986854), PrizmCode (p = 0.000261117207513819). NOTE(review): the PrizmCode test above reports p = 0.00112346149268779, not 0.000261117207513819 — the quoted value may come from an earlier run; verify.

# One-way ANOVA of each numeric candidate feature against Churn.
# A small Pr(>F) means the feature's mean differs between churners and
# non-churners, so it is kept for modeling. (The result tables below
# were un-commented output in the original and would break a plain R
# script; they are preserved here as ## comments.)
monthly_min_ANOVA <- aov(MonthlyMinutes ~ Churn, data = Telecom_Data)
xkabledply(monthly_min_ANOVA)
## Model: MonthlyMinutes ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 2.33e+07 23338054 83.3 0
## Residuals 50160 1.41e+10 280316 NA NA
monthly_rev_ANOVA <- aov(MonthlyRevenue ~ Churn, data = Telecom_Data)
xkabledply(monthly_rev_ANOVA)
## Model: MonthlyRevenue ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 1553 1553 0.782 0.376
## Residuals 50160 99608358 1986 NA NA
totalrec_charge_ANOVA <- aov(TotalRecurringCharge ~ Churn, data = Telecom_Data)
xkabledply(totalrec_charge_ANOVA)
## Model: TotalRecurringCharge ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 80954 80954 144 0
## Residuals 50160 28156104 561 NA NA
director_assisted_ANOVA <- aov(DirectorAssistedCalls ~ Churn, data = Telecom_Data)
xkabledply(director_assisted_ANOVA)
## Model: DirectorAssistedCalls ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 61.2 61.17 12.2 5e-04
## Residuals 50160 252087.6 5.03 NA NA
overage_min_ANOVA <- aov(OverageMinutes ~ Churn, data = Telecom_Data)
xkabledply(overage_min_ANOVA)
## Model: OverageMinutes ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 2.09e+05 209156 22.2 0
## Residuals 50160 4.73e+08 9437 NA NA
roaming_calls_ANOVA <- aov(RoamingCalls ~ Churn, data = Telecom_Data)
xkabledply(roaming_calls_ANOVA)
## Model: RoamingCalls ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 746 745.6 7.63 0.0058
## Residuals 50160 4903866 97.8 NA NA
dropped_calls_ANOVA <- aov(DroppedCalls ~ Churn, data = Telecom_Data)
xkabledply(dropped_calls_ANOVA)
## Model: DroppedCalls ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 265 264.7 3.21 0.0732
## Residuals 50160 4136398 82.5 NA NA
blocked_calls_ANOVA <- aov(BlockedCalls ~ Churn, data = Telecom_Data)
xkabledply(blocked_calls_ANOVA)
## Model: BlockedCalls ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 1.08e+01 10.8 0.089 0.765
## Residuals 50160 6.10e+06 121.7 NA NA
unanswered_calls_ANOVA <- aov(UnansweredCalls ~ Churn, data = Telecom_Data)
xkabledply(unanswered_calls_ANOVA)
## Model: UnansweredCalls ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 35194 35194 23.1 0
## Residuals 50160 76306982 1521 NA NA
cuscare_calls_ANOVA <- aov(CustomerCareCalls ~ Churn, data = Telecom_Data)
xkabledply(cuscare_calls_ANOVA)
## Model: CustomerCareCalls ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 1330 1329.7 50.5 0
## Residuals 50160 1320283 26.3 NA NA
monthsinservice_ANOVA <- aov(MonthsInService ~ Churn, data = Telecom_Data)
xkabledply(monthsinservice_ANOVA)
## Model: MonthsInService ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 1248 1247.9 13.1 3e-04
## Residuals 50160 4769719 95.1 NA NA
unique_subs_ANOVA <- aov(UniqueSubs ~ Churn, data = Telecom_Data)
xkabledply(unique_subs_ANOVA)
## Model: UniqueSubs ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 54.7 54.668 72.8 0
## Residuals 50160 37674.8 0.751 NA NA
Active_Subs_ANOVA <- aov(ActiveSubs ~ Churn, data = Telecom_Data)
xkabledply(Active_Subs_ANOVA)
## Model: ActiveSubs ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 1.99 1.992 4.97 0.0258
## Residuals 50160 20112.31 0.401 NA NA
current_eqp_ANOVA <- aov(CurrentEquipmentDays ~ Churn, data = Telecom_Data)
xkabledply(current_eqp_ANOVA)
## Model: CurrentEquipmentDays ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 3.16e+07 31557745 505 0
## Residuals 50160 3.13e+09 62466 NA NA
AgeHH1_ANOVA <- aov(AgeHH1 ~ Churn, data = Telecom_Data)
xkabledply(AgeHH1_ANOVA)
## Model: AgeHH1 ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 4661 4661 43.1 0
## Residuals 50160 5429490 108 NA NA
AgeHH2_ANOVA <- aov(AgeHH2 ~ Churn, data = Telecom_Data)
xkabledply(AgeHH2_ANOVA)
## Model: AgeHH2 ~ Churn
## Df Sum Sq Mean Sq F value Pr(>F)
## Churn 1 2216 2215.8 25.9 0
## Residuals 50160 4293401 85.6 NA NA
#blocked_calls_ANOVA=aov(BlockedCalls ~ Churn, data=Telecom_Data)
#xkabledply(blocked_calls_ANOVA)

#blocked_calls_ANOVA=aov(BlockedCalls ~ Churn, data=Telecom_Data)
#xkabledply(blocked_calls_ANOVA)

#blocked_calls_ANOVA=aov(BlockedCalls ~ Churn, data=Telecom_Data)
#xkabledply(blocked_calls_ANOVA)

0.9 Removing features whose chi-square p-value (categorical) or ANOVA p-value (numerical) is greater than 0.05

# Keep only the features that survived the chi-square (categorical) and
# ANOVA (numeric) screens, plus the target Churn.
select_feats <- c("IncomeGroup", "ChildrenInHH", "Homeownership", "PrizmCode", "MonthlyMinutes", "MonthlyRevenue", "TotalRecurringCharge", "DirectorAssistedCalls", "OverageMinutes", "RoamingCalls", "DroppedCalls", "UnansweredCalls", "CustomerCareCalls", "MonthsInService", "UniqueSubs", "ActiveSubs", "CurrentEquipmentDays", "AgeHH1", "AgeHH2", "Churn")


Telecom_Data_rm <- subset(Telecom_Data, select = select_feats)
# Install the required package
#install.packages("fastDummies")
cat_cols <- c("IncomeGroup", "ChildrenInHH", "Homeownership", "PrizmCode")

# Load the library
library(fastDummies)
# One-hot encode the categorical columns; drop the first level of each
# (reference level) and drop the original columns.
prepro_data <- dummy_cols(Telecom_Data_rm, select_columns = cat_cols, remove_first_dummy = TRUE, remove_selected_columns = TRUE)


# Recode the target: "No" -> 0, "Yes" -> 1 (values land as "0"/"1"
# strings because Churn is a character column).
prepro_data$Churn <- replace(prepro_data$Churn, prepro_data$Churn == "No", 0)
prepro_data$Churn <- replace(prepro_data$Churn, prepro_data$Churn == "Yes", 1)

## Converting the "0"/"1" character values to numeric
prepro_data$Churn <- as.numeric(prepro_data$Churn)

0.10 Undersampling to correct the class imbalance

#library(caret)
# Class balance before resampling: churners (1) are a minority.
table(prepro_data$Churn)
## 
##     0     1 
## 35965 14197
churn_0 <- subset(prepro_data, prepro_data$Churn == 0)
churn_1 <- subset(prepro_data, prepro_data$Churn == 1)

nrow(churn_0)
## [1] 35965
nrow(churn_1)
## [1] 14197
set.seed(704)

# Downsample the majority class to the minority-class size. Using
# nrow(churn_1) instead of a hard-coded 14197 keeps this correct if the
# data change.
churn_0_sampled <- churn_0[sample(nrow(churn_0), nrow(churn_1)), ]

nrow(churn_0_sampled)
## [1] 14197
nrow(churn_1)
## [1] 14197
# BUG FIX: the original used union(), which has set semantics (and with
# base R mangles data frames into column lists). Stacking the two class
# subsets row-wise is what undersampling requires.
under_sampl <- rbind(churn_0_sampled, churn_1)

library(caTools)


# Make the split reproducible.
set.seed(1)

# Stratified 80/20 split on Churn; sample.split keeps the class ratio
# equal in both partitions.
sample_under <- sample.split(under_sampl$Churn, SplitRatio = 0.8)
train_under <- subset(under_sampl, sample_under == TRUE)
test_under <- subset(under_sampl, sample_under == FALSE)

# View dimensions of the training and test sets.
dim(train_under)
## [1] 22716    30
dim(test_under)
## [1] 5678   30
table(train_under$Churn)
## 
##     0     1 
## 11358 11358
table(test_under$Churn)
## 
##    0    1 
## 2839 2839

0.11 Predictive Modeling on Undersampled data

0.12 Random Forest

# Loading packages
library(caTools)
library(randomForest)
library(caret)

# Encode the target as a factor so randomForest performs
# classification rather than regression.
train_under$Churn <- factor(train_under$Churn, levels = c(0, 1))

test_under$Churn <- factor(test_under$Churn, levels = c(0, 1))

# Separate predictors (x) from target (y).
x_train <- subset(train_under, select = -c(Churn))
y_train <- train_under$Churn

x_test <- subset(test_under, select = -c(Churn))
y_test <- test_under$Churn

#y_train<-factor(y_train)
#y_test<-factor(y_test)


# Fit a 500-tree random forest on the undersampled training data.
# NOTE(review): no set.seed() immediately before this call, so the
# fitted forest is not exactly reproducible run-to-run.
classifier_RF <- randomForest(x = x_train,
                              y = y_train,
                              ntree = 500)
  

  
# Predicting the Test set results
y_pred <- predict(classifier_RF, newdata = x_test)

# Confusion matrix with full statistics; churn ("1") is the positive class.
confusion_mtx <- confusionMatrix(y_pred, y_test, mode = "everything", positive = "1")
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1585 1041
##          1 1254 1798
##                                         
##                Accuracy : 0.596         
##                  95% CI : (0.583, 0.609)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : < 2e-16       
##                                         
##                   Kappa : 0.192         
##                                         
##  Mcnemar's Test P-Value : 9.63e-06      
##                                         
##             Sensitivity : 0.633         
##             Specificity : 0.558         
##          Pos Pred Value : 0.589         
##          Neg Pred Value : 0.604         
##               Precision : 0.589         
##                  Recall : 0.633         
##                      F1 : 0.610         
##              Prevalence : 0.500         
##          Detection Rate : 0.317         
##    Detection Prevalence : 0.538         
##       Balanced Accuracy : 0.596         
##                                         
##        'Positive' Class : 1             
## 
#confusion_mtx = table(y_test, y_pred)
#confusion_mtx

# Plot error vs number of trees
plot(classifier_RF)

# Importance plot (mean decrease in Gini impurity per predictor)
importance(classifier_RF)
##                       MeanDecreaseGini
## MonthlyMinutes                  1049.7
## MonthlyRevenue                  1014.2
## TotalRecurringCharge             633.9
## DirectorAssistedCalls            484.9
## OverageMinutes                   630.8
## RoamingCalls                     422.9
## DroppedCalls                     765.1
## UnansweredCalls                  893.8
## CustomerCareCalls                446.6
## MonthsInService                  886.7
## UniqueSubs                       220.7
## ActiveSubs                       153.3
## CurrentEquipmentDays            1249.2
## AgeHH1                           619.6
## AgeHH2                           486.2
## IncomeGroup_1                     52.9
## IncomeGroup_2                     36.6
## IncomeGroup_3                     67.6
## IncomeGroup_4                     83.1
## IncomeGroup_5                     83.6
## IncomeGroup_6                    119.8
## IncomeGroup_7                     97.1
## IncomeGroup_8                     64.4
## IncomeGroup_9                     89.4
## ChildrenInHH_Yes                 128.3
## Homeownership_Unknown            123.8
## PrizmCode_Rural                   63.4
## PrizmCode_Suburban               149.5
## PrizmCode_Town                   119.6
# Variable importance plot
varImpPlot(classifier_RF)

# Logistic regression baseline on the same undersampled split.
model <- glm(Churn ~ ., train_under, family = "binomial")
# Predicted churn probabilities for the held-out set.
pred <- predict(model, newdata = test_under, type = "response")
# Threshold at 0.5: probability >= 0.5 is classified as churn (1).
pred <- ifelse(pred >= 0.5, 1, 0)
pred <- as.factor(pred)
summary(model)
## 
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = train_under)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.517  -1.148  -0.137   1.159   1.800  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.28e-01   1.10e-01    2.07  0.03810 *  
## MonthlyMinutes        -2.14e-04   4.97e-05   -4.31  1.6e-05 ***
## MonthlyRevenue         2.43e-03   1.00e-03    2.44  0.01489 *  
## TotalRecurringCharge  -4.31e-03   1.11e-03   -3.88  0.00011 ***
## DirectorAssistedCalls -1.18e-02   6.92e-03   -1.70  0.08866 .  
## OverageMinutes         6.67e-04   3.63e-04    1.84  0.06644 .  
## RoamingCalls           7.08e-03   2.63e-03    2.69  0.00706 ** 
## DroppedCalls           9.04e-03   1.96e-03    4.62  3.8e-06 ***
## UnansweredCalls        4.56e-04   4.80e-04    0.95  0.34127    
## CustomerCareCalls     -9.12e-03   3.42e-03   -2.67  0.00768 ** 
## MonthsInService       -8.74e-03   1.63e-03   -5.37  7.9e-08 ***
## UniqueSubs             2.24e-01   2.61e-02    8.59  < 2e-16 ***
## ActiveSubs            -1.98e-01   3.59e-02   -5.51  3.7e-08 ***
## CurrentEquipmentDays   1.12e-03   6.56e-05   17.11  < 2e-16 ***
## AgeHH1                -5.19e-03   1.53e-03   -3.39  0.00071 ***
## AgeHH2                -2.92e-03   1.70e-03   -1.72  0.08560 .  
## IncomeGroup_1         -1.18e-02   8.16e-02   -0.14  0.88515    
## IncomeGroup_2         -8.70e-02   9.96e-02   -0.87  0.38236    
## IncomeGroup_3         -2.83e-01   7.31e-02   -3.86  0.00011 ***
## IncomeGroup_4         -1.67e-01   6.84e-02   -2.44  0.01477 *  
## IncomeGroup_5         -2.08e-01   6.90e-02   -3.01  0.00259 ** 
## IncomeGroup_6         -2.08e-01   6.11e-02   -3.41  0.00066 ***
## IncomeGroup_7         -1.15e-01   6.64e-02   -1.73  0.08358 .  
## IncomeGroup_8         -2.07e-01   8.10e-02   -2.56  0.01053 *  
## IncomeGroup_9         -2.04e-01   6.98e-02   -2.93  0.00340 ** 
## ChildrenInHH_Yes       6.81e-02   3.43e-02    1.98  0.04717 *  
## Homeownership_Unknown -4.39e-02   5.08e-02   -0.87  0.38683    
## PrizmCode_Rural        1.38e-01   6.51e-02    2.12  0.03427 *  
## PrizmCode_Suburban    -4.43e-02   3.17e-02   -1.40  0.16164    
## PrizmCode_Town         1.76e-02   4.04e-02    0.44  0.66293    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31491  on 22715  degrees of freedom
## Residual deviance: 30837  on 22686  degrees of freedom
## AIC: 30897
## 
## Number of Fisher Scoring iterations: 4
# Confusion Matrix
confusion_mtx <- confusionMatrix(pred, test_under$Churn, mode = "everything", positive = "1")
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1661 1316
##          1 1178 1523
##                                         
##                Accuracy : 0.561         
##                  95% CI : (0.548, 0.574)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : < 2e-16       
##                                         
##                   Kappa : 0.122         
##                                         
##  Mcnemar's Test P-Value : 0.00608       
##                                         
##             Sensitivity : 0.536         
##             Specificity : 0.585         
##          Pos Pred Value : 0.564         
##          Neg Pred Value : 0.558         
##               Precision : 0.564         
##                  Recall : 0.536         
##                      F1 : 0.550         
##              Prevalence : 0.500         
##          Detection Rate : 0.268         
##    Detection Prevalence : 0.476         
##       Balanced Accuracy : 0.561         
##                                         
##        'Positive' Class : 1             
## 

0.12.1 KNN

library("class")

chooseK <- function(k, train_set, val_set, train_class, val_class) {
  # Fit a k-NN classifier with `k` neighbors on train_set and return the
  # validation accuracy as a 1x2 matrix: cbind(k, accuracy).
  # set.seed makes tie-breaking among equidistant neighbors deterministic.
  set.seed(1)
  class_knn <- knn(train = train_set,    #<- training set cases
                   test = val_set,       #<- validation set cases
                   cl = train_class,     #<- class labels for training cases
                   k = k)                #<- number of neighbors considered

  # Accuracy = proportion of validation cases predicted correctly.
  # This is equivalent to summing the diagonal of table(pred, truth),
  # but also stays correct when the confusion table is not square
  # (e.g. a class absent from the predictions).
  accu <- mean(class_knn == val_class)
  cbind(k = k, accuracy = accu)
}

# Sweep odd k from 1 to 21 and record validation accuracy for each.
knn_different_k <- sapply(seq(1, 21, by = 2),  #<- odd k from 1 to 21
                          function(x) chooseK(x, train_set = x_train, val_set = x_test, train_class = y_train, val_class = y_test))
str(knn_different_k)
##  num [1:2, 1:11] 1 0.528 3 0.533 5 ...
# Reshape the 2 x 11 result matrix (row 1 = k, row 2 = accuracy)
# into a data frame for plotting.
knn_different_k <- data.frame(k = knn_different_k[1, ],
                              accuracy = knn_different_k[2, ])


loadPkg("ggplot2")
# Accuracy-vs-k curve to pick the best neighborhood size.
ggplot(knn_different_k,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) + 
  labs(title = "accuracy vs k")

# BUG FIX: the original passed the full train_under/test_under data
# frames — which still contain the Churn column — as k-NN features,
# leaking the label into the distance computation. Use the
# feature-only matrices built earlier instead. (The unused
# `cl = train_under[,1]` assignment was also removed.)
classifier_knn2 <- knn(train = x_train,
                       test = x_test,
                       cl = y_train,
                       k = 17, prob = TRUE)

confusion_mtx <- confusionMatrix(classifier_knn2, as.factor(test_under$Churn), mode = "everything", positive = "1")
print(confusion_mtx)
## NOTE(review): the statistics below were produced by the original
## (label-leaking) run and may change after the fix.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1572 1271
##          1 1267 1568
##                                        
##                Accuracy : 0.553        
##                  95% CI : (0.54, 0.566)
##     No Information Rate : 0.5          
##     P-Value [Acc > NIR] : 7.13e-16     
##                                        
##                   Kappa : 0.106        
##                                        
##  Mcnemar's Test P-Value : 0.953        
##                                        
##             Sensitivity : 0.552        
##             Specificity : 0.554        
##          Pos Pred Value : 0.553        
##          Neg Pred Value : 0.553        
##               Precision : 0.553        
##                  Recall : 0.552        
##                      F1 : 0.553        
##              Prevalence : 0.500        
##          Detection Rate : 0.276        
##    Detection Prevalence : 0.499        
##       Balanced Accuracy : 0.553        
##                                        
##        'Positive' Class : 1            
## 

##Naive Bayes

library(naivebayes)

# Gaussian naive Bayes on the undersampled training data.
model_nb <- naive_bayes(Churn ~ ., data = train_under)
summary(model_nb)
## 
## ================================== Naive Bayes ================================== 
##  
## - Call: naive_bayes.formula(formula = Churn ~ ., data = train_under) 
## - Laplace: 0 
## - Classes: 2 
## - Samples: 22716 
## - Features: 29 
## - Conditional distributions: 
##     - Gaussian: 29
## - Prior probabilities: 
##     - 0: 0.5
##     - 1: 0.5
## 
## ---------------------------------------------------------------------------------
# Class predictions for the held-out feature set.
pred <- predict(model_nb, newdata = x_test, type = "class")

# NOTE(review): this summary(model) re-prints the *logistic regression*
# fitted earlier, not the naive Bayes model — almost certainly a
# copy-paste slip; summary(model_nb) is already shown above. Confirm
# whether it should be removed.
summary(model)
## 
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = train_under)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.517  -1.148  -0.137   1.159   1.800  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.28e-01   1.10e-01    2.07  0.03810 *  
## MonthlyMinutes        -2.14e-04   4.97e-05   -4.31  1.6e-05 ***
## MonthlyRevenue         2.43e-03   1.00e-03    2.44  0.01489 *  
## TotalRecurringCharge  -4.31e-03   1.11e-03   -3.88  0.00011 ***
## DirectorAssistedCalls -1.18e-02   6.92e-03   -1.70  0.08866 .  
## OverageMinutes         6.67e-04   3.63e-04    1.84  0.06644 .  
## RoamingCalls           7.08e-03   2.63e-03    2.69  0.00706 ** 
## DroppedCalls           9.04e-03   1.96e-03    4.62  3.8e-06 ***
## UnansweredCalls        4.56e-04   4.80e-04    0.95  0.34127    
## CustomerCareCalls     -9.12e-03   3.42e-03   -2.67  0.00768 ** 
## MonthsInService       -8.74e-03   1.63e-03   -5.37  7.9e-08 ***
## UniqueSubs             2.24e-01   2.61e-02    8.59  < 2e-16 ***
## ActiveSubs            -1.98e-01   3.59e-02   -5.51  3.7e-08 ***
## CurrentEquipmentDays   1.12e-03   6.56e-05   17.11  < 2e-16 ***
## AgeHH1                -5.19e-03   1.53e-03   -3.39  0.00071 ***
## AgeHH2                -2.92e-03   1.70e-03   -1.72  0.08560 .  
## IncomeGroup_1         -1.18e-02   8.16e-02   -0.14  0.88515    
## IncomeGroup_2         -8.70e-02   9.96e-02   -0.87  0.38236    
## IncomeGroup_3         -2.83e-01   7.31e-02   -3.86  0.00011 ***
## IncomeGroup_4         -1.67e-01   6.84e-02   -2.44  0.01477 *  
## IncomeGroup_5         -2.08e-01   6.90e-02   -3.01  0.00259 ** 
## IncomeGroup_6         -2.08e-01   6.11e-02   -3.41  0.00066 ***
## IncomeGroup_7         -1.15e-01   6.64e-02   -1.73  0.08358 .  
## IncomeGroup_8         -2.07e-01   8.10e-02   -2.56  0.01053 *  
## IncomeGroup_9         -2.04e-01   6.98e-02   -2.93  0.00340 ** 
## ChildrenInHH_Yes       6.81e-02   3.43e-02    1.98  0.04717 *  
## Homeownership_Unknown -4.39e-02   5.08e-02   -0.87  0.38683    
## PrizmCode_Rural        1.38e-01   6.51e-02    2.12  0.03427 *  
## PrizmCode_Suburban    -4.43e-02   3.17e-02   -1.40  0.16164    
## PrizmCode_Town         1.76e-02   4.04e-02    0.44  0.66293    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31491  on 22715  degrees of freedom
## Residual deviance: 30837  on 22686  degrees of freedom
## AIC: 30897
## 
## Number of Fisher Scoring iterations: 4
# Confusion Matrix
confusion_mtx <- confusionMatrix(pred, test_under$Churn, mode = "everything", positive = "1")
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1921 1685
##          1  918 1154
##                                         
##                Accuracy : 0.542         
##                  95% CI : (0.528, 0.555)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : 2e-10         
##                                         
##                   Kappa : 0.083         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.406         
##             Specificity : 0.677         
##          Pos Pred Value : 0.557         
##          Neg Pred Value : 0.533         
##               Precision : 0.557         
##                  Recall : 0.406         
##                      F1 : 0.470         
##              Prevalence : 0.500         
##          Detection Rate : 0.203         
##    Detection Prevalence : 0.365         
##       Balanced Accuracy : 0.542         
##                                         
##        'Positive' Class : 1             
## 

0.13 Decision Tree

library(tree)
# Single decision tree on the undersampled training data.
classifier <- tree(formula = Churn ~ .,
                   data = train_under)

# Predicting the Test set results (class labels, not probabilities)
y_pred <- predict(classifier,
                  newdata = test_under,
                  type = 'class')

# Making the Confusion Matrix
confusion_mtx <- confusionMatrix(y_pred, test_under$Churn, mode = "everything", positive = "1")

print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1366  919
##          1 1473 1920
##                                         
##                Accuracy : 0.579         
##                  95% CI : (0.566, 0.592)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.157         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.676         
##             Specificity : 0.481         
##          Pos Pred Value : 0.566         
##          Neg Pred Value : 0.598         
##               Precision : 0.566         
##                  Recall : 0.676         
##                      F1 : 0.616         
##              Prevalence : 0.500         
##          Detection Rate : 0.338         
##    Detection Prevalence : 0.598         
##       Balanced Accuracy : 0.579         
##                                         
##        'Positive' Class : 1             
## 

Let's try out the SMOTE technique and compare the results.

library(smotefamily)
# SMOTE oversampling of the minority class on the full preprocessed
# data. NOTE(review): SMOTE is applied *before* the train/test split,
# so synthetic test rows are derived from rows that may also appear in
# training — the test metrics below are optimistic; consider applying
# SMOTE to the training partition only.
x <- subset(prepro_data, select = -c(Churn))
y <- prepro_data$Churn
#smote = SMOTE(x,y)
#x<-as.numeric(x)
#y<-as.numeric(y)
# K = 6 nearest neighbors for synthetic-sample generation; the argument
# is named instead of passed positionally for clarity.
smote <- SMOTE(x, y, K = 6)
smote_complete <- smote$data
str(smote_complete)
## 'data.frame':    64359 obs. of  30 variables:
##  $ MonthlyMinutes       : num  1143 884 62 49 796 ...
##  $ MonthlyRevenue       : num  114.9 90.9 51 22.8 72.5 ...
##  $ TotalRecurringCharge : num  45 60 70 36 45 28 82 60 30 50 ...
##  $ DirectorAssistedCalls: num  1.24 1.98 0.99 0 2.23 0.74 4.46 1.73 0.25 0.25 ...
##  $ OverageMinutes       : num  214 127 0 1 100 7 0 1 79 14 ...
##  $ RoamingCalls         : num  0 0.8 0 0 0 0.5 0 19.5 0 6.7 ...
##  $ DroppedCalls         : num  10.7 3.3 2 1 8 7 16.7 3 4 4 ...
##  $ UnansweredCalls      : num  106.7 46.3 6.7 5.7 73.7 ...
##  $ CustomerCareCalls    : num  4.3 0.3 0 0 0 1 0 0 2.3 2.3 ...
##  $ MonthsInService      : num  15 25 6 11 7 11 29 13 26 11 ...
##  $ UniqueSubs           : num  3 1 1 2 1 1 2 1 1 1 ...
##  $ ActiveSubs           : num  1 1 1 2 1 1 2 1 1 1 ...
##  $ CurrentEquipmentDays : num  84 748 190 326 202 52 183 392 788 313 ...
##  $ AgeHH1               : num  52 42 42 52 40 50 48 32 30 46 ...
##  $ AgeHH2               : num  32 44 44 52 20 48 46 44 44 44 ...
##  $ IncomeGroup_1        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IncomeGroup_2        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IncomeGroup_3        : num  0 0 0 0 0 1 0 0 0 1 ...
##  $ IncomeGroup_4        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IncomeGroup_5        : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ IncomeGroup_6        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ IncomeGroup_7        : num  1 0 0 1 0 0 1 0 0 0 ...
##  $ IncomeGroup_8        : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ IncomeGroup_9        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ChildrenInHH_Yes     : num  1 0 0 1 1 0 0 0 0 1 ...
##  $ Homeownership_Unknown: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ PrizmCode_Rural      : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ PrizmCode_Suburban   : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ PrizmCode_Town       : num  0 0 0 0 1 0 0 0 0 1 ...
##  $ class                : chr  "1" "1" "1" "1" ...
#write.csv(smote_complete, "D:\\ID_Project\\T9-Outlier-22FA\\Data Preprocessing\\smote_complete.csv", row.names=FALSE)

#write.csv(test, "D:\\ID_Project\\T9-Outlier-22FA\\Data Preprocessing\\test.csv", row.names=FALSE)
# Use 80% of the dataset as training set and the remaining 20% as test set.
# The local is renamed from `sample` to avoid shadowing base::sample.
# NOTE(review): unlike the earlier split, no set.seed() precedes this
# call, so the SMOTE split is not reproducible.
sample_smote <- sample.split(smote_complete$class, SplitRatio = 0.8)
train_smote <- subset(smote_complete, sample_smote == TRUE)
test_smote <- subset(smote_complete, sample_smote == FALSE)

0.14 Random Forest

# Loading package
library(caTools)
library(randomForest)
library(caret)

train_smote$class = factor(train_smote$class,
                        levels = c(0, 1))

test_smote$class = factor(test_smote$class,
                        levels = c(0, 1))

x_train<-subset(train_smote, select = -c(class))
y_train<-train_smote$class

x_test <- subset(test_smote, select = -c(class))
y_test <-test_smote$class

#y_train<-factor(y_train)
#y_test<-factor(y_test)


classifier_RF = randomForest(x = x_train,
                             y = y_train,
                             ntree = 500)
  

  
# Predicting the Test set results
y_pred = predict(classifier_RF, newdata = x_test)
  
# Confusion Matrix
confusion_mtx = confusionMatrix(y_pred,y_test,mode = "everything",positive = "1")
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6987 2574
##          1  206 3105
##                                         
##                Accuracy : 0.784         
##                  95% CI : (0.777, 0.791)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.542         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.547         
##             Specificity : 0.971         
##          Pos Pred Value : 0.938         
##          Neg Pred Value : 0.731         
##               Precision : 0.938         
##                  Recall : 0.547         
##                      F1 : 0.691         
##              Prevalence : 0.441         
##          Detection Rate : 0.241         
##    Detection Prevalence : 0.257         
##       Balanced Accuracy : 0.759         
##                                         
##        'Positive' Class : 1             
## 
# Alternative: a raw contingency table instead of caret::confusionMatrix().
#confusion_mtx = table(y_test, y_pred)
#confusion_mtx
  
# Plotting model (error rate as a function of the number of trees grown).
plot(classifier_RF)

# Importance plot: per-variable MeanDecreaseGini, printed below.
importance(classifier_RF)
##                       MeanDecreaseGini
## MonthlyMinutes                  1705.4
## MonthlyRevenue                  1598.2
## TotalRecurringCharge            1151.1
## DirectorAssistedCalls            976.9
## OverageMinutes                  1057.3
## RoamingCalls                     772.8
## DroppedCalls                    1296.4
## UnansweredCalls                 1480.0
## CustomerCareCalls                833.1
## MonthsInService                 1748.1
## UniqueSubs                      1316.1
## ActiveSubs                       784.5
## CurrentEquipmentDays            2070.3
## AgeHH1                          1100.3
## AgeHH2                           842.6
## IncomeGroup_1                    113.6
## IncomeGroup_2                     67.8
## IncomeGroup_3                    159.4
## IncomeGroup_4                    199.9
## IncomeGroup_5                    220.9
## IncomeGroup_6                    673.9
## IncomeGroup_7                    306.3
## IncomeGroup_8                    128.0
## IncomeGroup_9                    323.0
## ChildrenInHH_Yes                 905.6
## Homeownership_Unknown           1233.9
## PrizmCode_Rural                  125.1
## PrizmCode_Suburban              1136.6
## PrizmCode_Town                   429.2
# Variable importance plot (dotchart of the importance table printed above).
varImpPlot(classifier_RF)

# NOTE(review): duplicate of the print(confusion_mtx) call earlier -- the
# same random forest confusion matrix is printed twice.
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6987 2574
##          1  206 3105
##                                         
##                Accuracy : 0.784         
##                  95% CI : (0.777, 0.791)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.542         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.547         
##             Specificity : 0.971         
##          Pos Pred Value : 0.938         
##          Neg Pred Value : 0.731         
##               Precision : 0.938         
##                  Recall : 0.547         
##                      F1 : 0.691         
##              Prevalence : 0.441         
##          Detection Rate : 0.241         
##    Detection Prevalence : 0.257         
##       Balanced Accuracy : 0.759         
##                                         
##        'Positive' Class : 1             
## 
# Baseline: binomial logistic regression on the same SMOTE-balanced data.
model=glm(class ~ ., train_smote, family = "binomial")
# Predicted churn probabilities on the test set, hard-thresholded at 0.5
# into classes 1 (churn) / 0 before converting to a factor for caret.
pred=predict(model,newdata=test_smote,type="response")
pred[pred >= 0.5]=1         
pred[pred < 0.5 ]=0
pred=as.factor(pred)
summary(model)
## 
## Call:
## glm(formula = class ~ ., family = "binomial", data = train_smote)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -2.25   -1.08   -0.89    1.23    2.21  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.44e-01   7.62e-02    3.20  0.00139 ** 
## MonthlyMinutes        -1.56e-04   3.35e-05   -4.68  2.9e-06 ***
## MonthlyRevenue         2.74e-03   7.19e-04    3.81  0.00014 ***
## TotalRecurringCharge  -5.43e-03   7.82e-04   -6.93  4.1e-12 ***
## DirectorAssistedCalls -1.76e-02   5.20e-03   -3.38  0.00073 ***
## OverageMinutes         3.82e-04   2.53e-04    1.51  0.13093    
## RoamingCalls          -4.31e-04   1.28e-03   -0.34  0.73734    
## DroppedCalls           6.45e-03   1.36e-03    4.75  2.1e-06 ***
## UnansweredCalls        5.03e-04   3.37e-04    1.49  0.13553    
## CustomerCareCalls     -1.29e-02   2.44e-03   -5.28  1.3e-07 ***
## MonthsInService       -1.17e-02   1.14e-03  -10.24  < 2e-16 ***
## UniqueSubs             2.28e-01   1.81e-02   12.56  < 2e-16 ***
## ActiveSubs            -2.41e-01   2.50e-02   -9.66  < 2e-16 ***
## CurrentEquipmentDays   1.09e-03   4.41e-05   24.73  < 2e-16 ***
## AgeHH1                -7.26e-03   1.06e-03   -6.83  8.5e-12 ***
## AgeHH2                -2.47e-03   1.19e-03   -2.07  0.03817 *  
## IncomeGroup_1         -1.45e-01   5.57e-02   -2.59  0.00950 ** 
## IncomeGroup_2         -1.40e-01   6.94e-02   -2.02  0.04342 *  
## IncomeGroup_3         -3.17e-01   5.16e-02   -6.14  8.2e-10 ***
## IncomeGroup_4         -2.50e-01   4.80e-02   -5.21  1.9e-07 ***
## IncomeGroup_5         -2.73e-01   4.84e-02   -5.64  1.7e-08 ***
## IncomeGroup_6         -2.57e-01   4.25e-02   -6.04  1.6e-09 ***
## IncomeGroup_7         -1.63e-01   4.65e-02   -3.51  0.00044 ***
## IncomeGroup_8         -3.00e-01   5.59e-02   -5.36  8.5e-08 ***
## IncomeGroup_9         -2.87e-01   4.86e-02   -5.91  3.4e-09 ***
## ChildrenInHH_Yes       1.22e-01   2.39e-02    5.09  3.5e-07 ***
## Homeownership_Unknown  8.58e-03   3.56e-02    0.24  0.80955    
## PrizmCode_Rural        1.00e-01   4.50e-02    2.23  0.02551 *  
## PrizmCode_Suburban    -3.38e-02   2.20e-02   -1.54  0.12404    
## PrizmCode_Town         4.20e-02   2.82e-02    1.49  0.13621    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70662  on 51486  degrees of freedom
## Residual deviance: 69239  on 51457  degrees of freedom
## AIC: 69299
## 
## Number of Fisher Scoring iterations: 4
# Confusion Matrix for the logistic regression predictions ("1" = positive).
confusion_mtx = confusionMatrix(pred,test_smote$class,mode = "everything",positive = "1")
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5842 4069
##          1 1351 1610
##                                        
##                Accuracy : 0.579        
##                  95% CI : (0.57, 0.587)
##     No Information Rate : 0.559        
##     P-Value [Acc > NIR] : 2.15e-06     
##                                        
##                   Kappa : 0.101        
##                                        
##  Mcnemar's Test P-Value : < 2e-16      
##                                        
##             Sensitivity : 0.284        
##             Specificity : 0.812        
##          Pos Pred Value : 0.544        
##          Neg Pred Value : 0.589        
##               Precision : 0.544        
##                  Recall : 0.284        
##                      F1 : 0.373        
##              Prevalence : 0.441        
##          Detection Rate : 0.125        
##    Detection Prevalence : 0.230        
##       Balanced Accuracy : 0.548        
##                                        
##        'Positive' Class : 1            
## 

0.14.1 KNN

library("class")

# Fit a k-NN classifier on the training cases, score the validation cases,
# and report the validation accuracy achieved with the supplied k.
#
# Args:
#   k:           number of neighbours to consider.
#   train_set:   training-set predictor columns.
#   val_set:     validation-set predictor columns.
#   train_class: class labels for the training cases.
#   val_class:   class labels for the validation cases.
#
# Returns a 1 x 2 matrix with columns `k` and `accuracy`.
chooseK <- function(k, train_set, val_set, train_class, val_class) {
  # Seed immediately before knn() so tie-breaking between equidistant
  # neighbours is reproducible across calls.
  set.seed(1)
  predicted <- knn(train = train_set,
                   test = val_set,
                   cl = train_class,
                   k = k)

  # Accuracy = correctly classified validation cases / all validation cases
  # (sum of the diagonal of the prediction-vs-truth table).
  hits <- table(predicted, val_class)
  acc <- sum(diag(hits)) / sum(hits)

  cbind(k = k, accuracy = acc)
}

# Evaluate k-NN accuracy for each odd k in 1..21 on the held-out split.
knn_different_k = sapply(seq(1, 21, by = 2),  #<- set k to be odd number from 1 to 21
                         function(x) chooseK(x, train_set = x_train,val_set = x_test,train_class =y_train,val_class = y_test))
str(knn_different_k)
##  num [1:2, 1:11] 1 0.752 3 0.677 5 ...
# Reshape the 2 x 11 result matrix (row 1 = k, row 2 = accuracy) into a frame.
knn_different_k = data.frame(k = knn_different_k[1,],
                             accuracy = knn_different_k[2,])


# loadPkg() appears to be a project helper (not base R); presumably it
# installs/attaches the package -- confirm it is defined earlier in the file.
loadPkg("ggplot2")
ggplot(knn_different_k,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) + 
  labs(title = "accuracy vs k")

# NOTE(review): `cl` is assigned but never used; the knn() call below passes
# train_smote$class directly.
cl = train_smote[,1]
# NOTE(review): train/test here are the full data frames, so the `class`
# column is also fed in as a predictor -- potential label leakage; verify
# this is intended before trusting the confusion matrix below.
classifier_knn2 <- knn(train = train_smote,
                      test = test_smote,
                      cl = train_smote$class,
                      k = 3,prob=TRUE)

confusion_mtx = confusionMatrix(classifier_knn2,as.factor(test_smote$class),mode = "everything",positive = "1")
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4424 1375
##          1 2769 4304
##                                        
##                Accuracy : 0.678        
##                  95% CI : (0.67, 0.686)
##     No Information Rate : 0.559        
##     P-Value [Acc > NIR] : <2e-16       
##                                        
##                   Kappa : 0.364        
##                                        
##  Mcnemar's Test P-Value : <2e-16       
##                                        
##             Sensitivity : 0.758        
##             Specificity : 0.615        
##          Pos Pred Value : 0.609        
##          Neg Pred Value : 0.763        
##               Precision : 0.609        
##                  Recall : 0.758        
##                      F1 : 0.675        
##              Prevalence : 0.441        
##          Detection Rate : 0.334        
##    Detection Prevalence : 0.549        
##       Balanced Accuracy : 0.686        
##                                        
##        'Positive' Class : 1            
## 

## Naive Bayes

library(naivebayes)

# Naive Bayes on the SMOTE-balanced training set; per the summary printed
# below, all 29 features get Gaussian conditional distributions.
model_nb <- naive_bayes(class ~ ., data = train_smote) 
summary(model_nb)
## 
## ================================== Naive Bayes ================================== 
##  
## - Call: naive_bayes.formula(formula = class ~ ., data = train_smote) 
## - Laplace: 0 
## - Classes: 2 
## - Samples: 51487 
## - Features: 29 
## - Conditional distributions: 
##     - Gaussian: 29
## - Prior probabilities: 
##     - 0: 0.5588
##     - 1: 0.4412
## 
## ---------------------------------------------------------------------------------
# Predicted classes for the test predictors from the naive Bayes model.
pred=predict(model_nb,newdata=x_test,type="class")

# NOTE(review): this re-prints the logistic regression fit (`model`), not the
# naive Bayes model -- likely a copy-paste slip; summary(model_nb) was
# presumably intended (its summary already appears above).
summary(model)
## 
## Call:
## glm(formula = class ~ ., family = "binomial", data = train_smote)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -2.25   -1.08   -0.89    1.23    2.21  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.44e-01   7.62e-02    3.20  0.00139 ** 
## MonthlyMinutes        -1.56e-04   3.35e-05   -4.68  2.9e-06 ***
## MonthlyRevenue         2.74e-03   7.19e-04    3.81  0.00014 ***
## TotalRecurringCharge  -5.43e-03   7.82e-04   -6.93  4.1e-12 ***
## DirectorAssistedCalls -1.76e-02   5.20e-03   -3.38  0.00073 ***
## OverageMinutes         3.82e-04   2.53e-04    1.51  0.13093    
## RoamingCalls          -4.31e-04   1.28e-03   -0.34  0.73734    
## DroppedCalls           6.45e-03   1.36e-03    4.75  2.1e-06 ***
## UnansweredCalls        5.03e-04   3.37e-04    1.49  0.13553    
## CustomerCareCalls     -1.29e-02   2.44e-03   -5.28  1.3e-07 ***
## MonthsInService       -1.17e-02   1.14e-03  -10.24  < 2e-16 ***
## UniqueSubs             2.28e-01   1.81e-02   12.56  < 2e-16 ***
## ActiveSubs            -2.41e-01   2.50e-02   -9.66  < 2e-16 ***
## CurrentEquipmentDays   1.09e-03   4.41e-05   24.73  < 2e-16 ***
## AgeHH1                -7.26e-03   1.06e-03   -6.83  8.5e-12 ***
## AgeHH2                -2.47e-03   1.19e-03   -2.07  0.03817 *  
## IncomeGroup_1         -1.45e-01   5.57e-02   -2.59  0.00950 ** 
## IncomeGroup_2         -1.40e-01   6.94e-02   -2.02  0.04342 *  
## IncomeGroup_3         -3.17e-01   5.16e-02   -6.14  8.2e-10 ***
## IncomeGroup_4         -2.50e-01   4.80e-02   -5.21  1.9e-07 ***
## IncomeGroup_5         -2.73e-01   4.84e-02   -5.64  1.7e-08 ***
## IncomeGroup_6         -2.57e-01   4.25e-02   -6.04  1.6e-09 ***
## IncomeGroup_7         -1.63e-01   4.65e-02   -3.51  0.00044 ***
## IncomeGroup_8         -3.00e-01   5.59e-02   -5.36  8.5e-08 ***
## IncomeGroup_9         -2.87e-01   4.86e-02   -5.91  3.4e-09 ***
## ChildrenInHH_Yes       1.22e-01   2.39e-02    5.09  3.5e-07 ***
## Homeownership_Unknown  8.58e-03   3.56e-02    0.24  0.80955    
## PrizmCode_Rural        1.00e-01   4.50e-02    2.23  0.02551 *  
## PrizmCode_Suburban    -3.38e-02   2.20e-02   -1.54  0.12404    
## PrizmCode_Town         4.20e-02   2.82e-02    1.49  0.13621    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70662  on 51486  degrees of freedom
## Residual deviance: 69239  on 51457  degrees of freedom
## AIC: 69299
## 
## Number of Fisher Scoring iterations: 4
# Confusion Matrix for the naive Bayes predictions ("1" = positive class).
confusion_mtx = confusionMatrix(pred,test_smote$class,mode = "everything",positive = "1")
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3001 1672
##          1 4192 4007
##                                         
##                Accuracy : 0.544         
##                  95% CI : (0.536, 0.553)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : 0.999         
##                                         
##                   Kappa : 0.117         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.706         
##             Specificity : 0.417         
##          Pos Pred Value : 0.489         
##          Neg Pred Value : 0.642         
##               Precision : 0.489         
##                  Recall : 0.706         
##                      F1 : 0.577         
##              Prevalence : 0.441         
##          Detection Rate : 0.311         
##    Detection Prevalence : 0.637         
##       Balanced Accuracy : 0.561         
##                                         
##        'Positive' Class : 1             
## 

0.15 Decision Tree

library(tree)
# Fit a single classification tree to the SMOTE-balanced training data.
classifier = tree(formula = class ~ .,
                data = train_smote)

# Predicting the Test set results (hard class labels, not probabilities).
y_pred = predict(classifier,
                newdata = test_smote,
                type = 'class')

# Making the Confusion Matrix ("1" = churn is the positive class).
confusion_mtx = confusionMatrix(y_pred,test_smote$class,mode = "everything",positive = "1")

print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7193 4581
##          1    0 1098
##                                         
##                Accuracy : 0.644         
##                  95% CI : (0.636, 0.652)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.211         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.1933        
##             Specificity : 1.0000        
##          Pos Pred Value : 1.0000        
##          Neg Pred Value : 0.6109        
##               Precision : 1.0000        
##                  Recall : 0.1933        
##                      F1 : 0.3240        
##              Prevalence : 0.4412        
##          Detection Rate : 0.0853        
##    Detection Prevalence : 0.0853        
##       Balanced Accuracy : 0.5967        
##                                         
##        'Positive' Class : 1             
## 

0.15.1 Trying different threshold values to improve recall/sensitivity

# Candidate probability cut-offs for calling the positive (churn) class.
thresh <- c(0.3, 0.35 ,0.4 ,0.45, 0.5, 0.55, 0.60,0.65)

# Class balance of the test target (factor levels 0/1).
table(y_test)
## y_test
##    0    1 
## 7193 5679
# Score the test set once with the fitted forest; the class probabilities are
# loop-invariant, so compute them outside the threshold loop instead of
# re-running predict() on every iteration.
y_pred = predict(classifier_RF, newdata = x_test, type = "prob")
rf_prob = y_pred[, 2]   # probability of the positive (churn) class

# Recode y_test from factor levels 0/1 (numeric codes 1/2) to "No"/"Yes"
# labels so the confusion matrices below read naturally.
y_test = as.numeric(y_test)
y_test[y_test == 1] <- "No"
y_test[y_test == 2] <- "Yes"

y_test <- as.factor(y_test)

for (x in thresh) {
  print("Threshold")
  print(x)

  # Classify as "Yes" (churn) when the churn probability reaches cut-off x.
  # Comparing the numeric rf_prob directly also fixes a latent bug in the
  # previous version, where the vector was coerced to character by the first
  # in-place replacement and the second comparison became lexicographic.
  y_prob <- ifelse(rf_prob >= x, "Yes", "No")
  y_prob <- as.factor(y_prob)

  confusion_mtx = confusionMatrix(y_prob, y_test, mode = "everything", positive = "Yes")
  print(confusion_mtx)
}
## [1] "Threshold"
## [1] 0.3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  3757  803
##        Yes 3436 4876
##                                         
##                Accuracy : 0.671         
##                  95% CI : (0.662, 0.679)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.363         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.859         
##             Specificity : 0.522         
##          Pos Pred Value : 0.587         
##          Neg Pred Value : 0.824         
##               Precision : 0.587         
##                  Recall : 0.859         
##                      F1 : 0.697         
##              Prevalence : 0.441         
##          Detection Rate : 0.379         
##    Detection Prevalence : 0.646         
##       Balanced Accuracy : 0.690         
##                                         
##        'Positive' Class : Yes           
##                                         
## [1] "Threshold"
## [1] 0.35
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  5008 1338
##        Yes 2185 4341
##                                         
##                Accuracy : 0.726         
##                  95% CI : (0.719, 0.734)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.454         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.764         
##             Specificity : 0.696         
##          Pos Pred Value : 0.665         
##          Neg Pred Value : 0.789         
##               Precision : 0.665         
##                  Recall : 0.764         
##                      F1 : 0.711         
##              Prevalence : 0.441         
##          Detection Rate : 0.337         
##    Detection Prevalence : 0.507         
##       Balanced Accuracy : 0.730         
##                                         
##        'Positive' Class : Yes           
##                                         
## [1] "Threshold"
## [1] 0.4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  6020 1867
##        Yes 1173 3812
##                                         
##                Accuracy : 0.764         
##                  95% CI : (0.756, 0.771)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.515         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.671         
##             Specificity : 0.837         
##          Pos Pred Value : 0.765         
##          Neg Pred Value : 0.763         
##               Precision : 0.765         
##                  Recall : 0.671         
##                      F1 : 0.715         
##              Prevalence : 0.441         
##          Detection Rate : 0.296         
##    Detection Prevalence : 0.387         
##       Balanced Accuracy : 0.754         
##                                         
##        'Positive' Class : Yes           
##                                         
## [1] "Threshold"
## [1] 0.45
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  6614 2281
##        Yes  579 3398
##                                         
##                Accuracy : 0.778         
##                  95% CI : (0.771, 0.785)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.535         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.598         
##             Specificity : 0.920         
##          Pos Pred Value : 0.854         
##          Neg Pred Value : 0.744         
##               Precision : 0.854         
##                  Recall : 0.598         
##                      F1 : 0.704         
##              Prevalence : 0.441         
##          Detection Rate : 0.264         
##    Detection Prevalence : 0.309         
##       Balanced Accuracy : 0.759         
##                                         
##        'Positive' Class : Yes           
##                                         
## [1] "Threshold"
## [1] 0.5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  6983 2570
##        Yes  210 3109
##                                         
##                Accuracy : 0.784         
##                  95% CI : (0.777, 0.791)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.542         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.547         
##             Specificity : 0.971         
##          Pos Pred Value : 0.937         
##          Neg Pred Value : 0.731         
##               Precision : 0.937         
##                  Recall : 0.547         
##                      F1 : 0.691         
##              Prevalence : 0.441         
##          Detection Rate : 0.242         
##    Detection Prevalence : 0.258         
##       Balanced Accuracy : 0.759         
##                                         
##        'Positive' Class : Yes           
##                                         
## [1] "Threshold"
## [1] 0.55
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  7109 2759
##        Yes   84 2920
##                                         
##                Accuracy : 0.779         
##                  95% CI : (0.772, 0.786)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.529         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.514         
##             Specificity : 0.988         
##          Pos Pred Value : 0.972         
##          Neg Pred Value : 0.720         
##               Precision : 0.972         
##                  Recall : 0.514         
##                      F1 : 0.673         
##              Prevalence : 0.441         
##          Detection Rate : 0.227         
##    Detection Prevalence : 0.233         
##       Balanced Accuracy : 0.751         
##                                         
##        'Positive' Class : Yes           
##                                         
## [1] "Threshold"
## [1] 0.6
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  7165 2869
##        Yes   28 2810
##                                         
##                Accuracy : 0.775         
##                  95% CI : (0.768, 0.782)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.518         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.495         
##             Specificity : 0.996         
##          Pos Pred Value : 0.990         
##          Neg Pred Value : 0.714         
##               Precision : 0.990         
##                  Recall : 0.495         
##                      F1 : 0.660         
##              Prevalence : 0.441         
##          Detection Rate : 0.218         
##    Detection Prevalence : 0.220         
##       Balanced Accuracy : 0.745         
##                                         
##        'Positive' Class : Yes           
##                                         
## [1] "Threshold"
## [1] 0.65
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  7184 2946
##        Yes    9 2733
##                                         
##                Accuracy : 0.77          
##                  95% CI : (0.763, 0.778)
##     No Information Rate : 0.559         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.508         
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.481         
##             Specificity : 0.999         
##          Pos Pred Value : 0.997         
##          Neg Pred Value : 0.709         
##               Precision : 0.997         
##                  Recall : 0.481         
##                      F1 : 0.649         
##              Prevalence : 0.441         
##          Detection Rate : 0.212         
##    Detection Prevalence : 0.213         
##       Balanced Accuracy : 0.740         
##                                         
##        'Positive' Class : Yes           
## 
# Summarise the accuracy/recall trade-off observed at the lower thresholds
# and visualise it as a scatter plot labelled with the threshold value.
thresh <- c(0.3, 0.35, 0.40, 0.45)
Accuracy <- c(0.67, 0.72, 0.76, 0.77)
Recall <- c(0.85, 0.75, 0.66, 0.58)

metrics <- data.frame(thresh, Accuracy, Recall)

library(plotly)

# Text styling for the threshold labels drawn next to each point.
t <- list(
  family = "sans serif",
  size = 14,
  color = toRGB("grey50")
)

# One point per threshold; the label is placed above and to the right.
fig <- plot_ly(metrics, x = ~Accuracy, y = ~Recall, text = ~thresh) %>%
  add_markers() %>%
  add_text(textfont = t, textposition = "top right")

fig

0.16 Model interpretability

# Importance scores for the fitted forest via caret::varImp.
# NOTE(review): `conditional` is an argument for party/cforest importance and
# is presumably ignored for a randomForest object -- confirm.
i_scores <- varImp(classifier_RF, conditional=TRUE)
#Gathering rownames in 'var'  and converting it to the factor
#to provide 'fill' parameter for the bar chart. 
i_scores <- i_scores %>% tibble::rownames_to_column("var") 
i_scores$var<- i_scores$var %>% as.factor()

# Bar chart of importance per variable, sorted most to least important.
plot_ly(
  data = i_scores,
  x = ~var,
  y = ~Overall,
  type = "bar"
) %>% 
layout(xaxis = list(categoryorder = "total descending"))

Let’s identify the high-risk customers, i.e. those with a high probability of churning.

# Re-score the test set and keep the churn-class probability column.
y_pred = predict(classifier_RF, newdata = x_test,type ="prob")
y_prob=y_pred[,2]

# Count customers per risk band. (The printed strings keep the original
# "Probablity" spelling because they are runtime output captured below.)
print("Probablity of churning greater than 0.80")
## [1] "Probablity of churning greater than 0.80"
print("High Risky customers count")
## [1] "High Risky customers count"
print(length(y_prob[y_prob >= 0.80]))
## [1] 2419
print(" Probablity of Churning between 0.60 to 0.80")
## [1] " Probablity of Churning between 0.60 to 0.80"
print("moderate Risky customers count")
## [1] "moderate Risky customers count"
print(length(y_prob[y_prob >= 0.60]) - length(y_prob[y_prob >= 0.80]))
## [1] 419